> ## Documentation Index
> Fetch the complete documentation index at: https://docs.sglang.io/llms.txt
> Use this file to discover all available pages before exploring further.

# Kimi-K2.5

export const KimiK25Deployment = () => {
  const options = {
    hardware: {
      name: 'hardware',
      title: 'Hardware Platform',
      items: [{
        id: 'h200',
        label: 'H200',
        default: true
      }, {
        id: 'b300',
        label: 'B300',
        default: false
      }, {
        id: 'mi300x',
        label: 'MI300X',
        default: false
      }, {
        id: 'mi325x',
        label: 'MI325X',
        default: false
      }, {
        id: 'mi350x',
        label: 'MI350X',
        default: false
      }, {
        id: 'mi355x',
        label: 'MI355X',
        default: false
      }]
    },
    quantization: {
      name: 'quantization',
      title: 'Quantization',
      getDynamicItems: values => {
        const hw = values.hardware;
        const isB300 = hw === 'b300';
        return [{
          id: 'int4',
          label: 'INT4',
          subtitle: 'initial model',
          default: true
        }, {
          id: 'nvfp4',
          label: 'NVFP4',
          subtitle: 'Blackwell only',
          default: false,
          disabled: !isB300,
          disabledReason: 'NVFP4 only on B300'
        }];
      }
    },
    reasoning: {
      name: 'reasoning',
      title: 'Reasoning Parser',
      items: [{
        id: 'disabled',
        label: 'Disabled',
        default: false
      }, {
        id: 'enabled',
        label: 'Enabled',
        default: true
      }]
    },
    toolcall: {
      name: 'toolcall',
      title: 'Tool Call Parser',
      items: [{
        id: 'disabled',
        label: 'Disabled',
        default: false
      }, {
        id: 'enabled',
        label: 'Enabled',
        default: true
      }]
    },
    dpattention: {
      name: 'dpattention',
      title: 'DP Attention',
      items: [{
        id: 'disabled',
        label: 'Disabled',
        subtitle: 'Low Latency',
        default: true
      }, {
        id: 'enabled',
        label: 'Enabled',
        subtitle: 'High Throughput',
        default: false
      }]
    },
    speculative: {
      name: 'speculative',
      title: 'Speculative Decoding',
      condition: values => values.hardware === 'h200' || values.hardware === 'b300',
      items: [{
        id: 'disabled',
        label: 'Disabled',
        default: true
      }, {
        id: 'enabled',
        label: 'Enabled',
        default: false
      }]
    }
  };
  const modelConfigs = {
    h200: {
      tp: 8
    },
    b300: {
      tp: 8
    },
    mi300x: {
      tp: 4
    },
    mi325x: {
      tp: 4
    },
    mi350x: {
      tp: 4
    },
    mi355x: {
      tp: 4
    }
  };
  const resolveItems = (option, values) => {
    if (typeof option.getDynamicItems === 'function') return option.getDynamicItems(values);
    return option.items;
  };
  const getInitialState = () => {
    const initialState = {};
    for (const [key, option] of Object.entries(options)) {
      const items = resolveItems(option, initialState);
      const def = items.find(i => i.default && !i.disabled) || items.find(i => !i.disabled) || items[0];
      initialState[key] = def.id;
    }
    return initialState;
  };
  const [values, setValues] = useState(getInitialState);
  const [isDark, setIsDark] = useState(false);
  useEffect(() => {
    const checkDarkMode = () => {
      const html = document.documentElement;
      const isDarkMode = html.classList.contains('dark') || html.getAttribute('data-theme') === 'dark' || html.style.colorScheme === 'dark';
      setIsDark(isDarkMode);
    };
    checkDarkMode();
    const observer = new MutationObserver(checkDarkMode);
    observer.observe(document.documentElement, {
      attributes: true,
      attributeFilter: ['class', 'data-theme', 'style']
    });
    return () => observer.disconnect();
  }, []);
  useEffect(() => {
    setValues(prev => {
      const next = {
        ...prev
      };
      for (const [key, option] of Object.entries(options)) {
        if (typeof option.getDynamicItems !== 'function') continue;
        const items = option.getDynamicItems(next);
        const current = items.find(i => i.id === next[key]);
        if (!current || current.disabled) {
          const fallback = items.find(i => i.default && !i.disabled) || items.find(i => !i.disabled);
          if (fallback) next[key] = fallback.id;
        }
      }
      return next;
    });
  }, [values.hardware]);
  const handleRadioChange = (optionName, value) => {
    setValues(prev => ({
      ...prev,
      [optionName]: value
    }));
  };
  const generateCommand = () => {
    const {hardware, quantization, speculative} = values;
    const isAMD = hardware === 'mi300x' || hardware === 'mi325x' || hardware === 'mi350x' || hardware === 'mi355x';
    if (quantization === 'nvfp4' && hardware !== 'b300') {
      return '# NVFP4 quantization is only supported on NVIDIA Blackwell GPUs (B300)';
    }
    if (speculative === 'enabled' && hardware !== 'h200' && hardware !== 'b300') {
      return '# Speculative Decoding for Kimi-K2.5 is only supported on H200 and B300';
    }
    const modelName = quantization === 'nvfp4' ? 'nvidia/Kimi-K2.5-NVFP4' : 'moonshotai/Kimi-K2.5';
    const hwConfig = modelConfigs[hardware];
    const tpValue = hwConfig.tp;
    let cmd = '';
    if (isAMD) {
      cmd += 'SGLANG_USE_AITER=1 SGLANG_ROCM_FUSED_DECODE_MLA=0 ';
    }
    if (speculative === 'enabled') {
      cmd += 'SGLANG_ENABLE_SPEC_V2=1 ';
    }
    if (isAMD || speculative === 'enabled') {
      cmd += '\\\n';
    }
    cmd += 'sglang serve \\\n';
    cmd += `  --model-path ${modelName}`;
    cmd += ` \\\n  --tp ${tpValue}`;
    cmd += ' \\\n  --trust-remote-code';
    if (values.dpattention === 'enabled') {
      cmd += ` \\\n  --dp ${tpValue} \\\n  --enable-dp-attention`;
    }
    if (values.reasoning === 'enabled') {
      cmd += ' \\\n  --reasoning-parser kimi_k2';
    }
    if (values.toolcall === 'enabled') {
      cmd += ' \\\n  --tool-call-parser kimi_k2';
    }
    if (speculative === 'enabled') {
      cmd += ' \\\n  --speculative-algorithm EAGLE3 \\\n  --speculative-num-steps 3 \\\n  --speculative-eagle-topk 1 \\\n  --speculative-num-draft-tokens 4 \\\n  --speculative-draft-model-path lightseekorg/kimi-k2.5-eagle3';
    }
    if (isAMD) {
      cmd += ' \\\n  --kv-cache-dtype fp8_e4m3';
    }
    cmd += ' \\\n  --host 0.0.0.0 \\\n  --port 30000';
    return cmd;
  };
  const containerStyle = {
    maxWidth: '900px',
    margin: '0 auto',
    display: 'flex',
    flexDirection: 'column',
    gap: '4px'
  };
  const cardStyle = {
    padding: '8px 12px',
    border: `1px solid ${isDark ? '#374151' : '#e5e7eb'}`,
    borderLeft: `3px solid ${isDark ? '#E85D4D' : '#D45D44'}`,
    borderRadius: '4px',
    display: 'flex',
    alignItems: 'center',
    gap: '12px',
    background: isDark ? '#1f2937' : '#fff'
  };
  const titleStyle = {
    fontSize: '13px',
    fontWeight: '600',
    minWidth: '140px',
    flexShrink: 0,
    color: isDark ? '#e5e7eb' : 'inherit'
  };
  const itemsStyle = {
    display: 'flex',
    rowGap: '2px',
    columnGap: '6px',
    flexWrap: 'wrap',
    alignItems: 'center',
    flex: 1
  };
  const labelBaseStyle = {
    padding: '4px 10px',
    border: `1px solid ${isDark ? '#9ca3af' : '#d1d5db'}`,
    borderRadius: '3px',
    cursor: 'pointer',
    display: 'inline-flex',
    flexDirection: 'column',
    alignItems: 'center',
    justifyContent: 'center',
    fontWeight: '500',
    fontSize: '13px',
    transition: 'all 0.2s',
    userSelect: 'none',
    minWidth: '45px',
    textAlign: 'center',
    flex: 1,
    background: isDark ? '#374151' : '#fff',
    color: isDark ? '#e5e7eb' : 'inherit'
  };
  const checkedStyle = {
    background: '#D45D44',
    color: 'white',
    borderColor: '#D45D44'
  };
  const disabledStyle = {
    cursor: 'not-allowed',
    opacity: 0.4
  };
  const subtitleStyle = {
    display: 'block',
    fontSize: '9px',
    marginTop: '1px',
    lineHeight: '1.1',
    opacity: 0.7
  };
  const commandDisplayStyle = {
    flex: 1,
    padding: '12px 16px',
    background: isDark ? '#111827' : '#f5f5f5',
    borderRadius: '6px',
    fontFamily: "'Menlo', 'Monaco', 'Courier New', monospace",
    fontSize: '12px',
    lineHeight: '1.5',
    color: isDark ? '#e5e7eb' : '#374151',
    whiteSpace: 'pre-wrap',
    overflowX: 'auto',
    margin: 0,
    border: `1px solid ${isDark ? '#374151' : '#e5e7eb'}`
  };
  return <div style={containerStyle} className="not-prose">
      {Object.entries(options).map(([key, option]) => {
    if (typeof option.condition === 'function' && !option.condition(values)) return null;
    const items = resolveItems(option, values);
    return <div key={key} style={cardStyle}>
            <div style={titleStyle}>{option.title}</div>
            <div style={itemsStyle}>
              {items.map(item => {
      const isChecked = values[option.name] === item.id;
      const isDisabled = !!item.disabled;
      return <label key={item.id} style={{
        ...labelBaseStyle,
        ...isChecked ? checkedStyle : {},
        ...isDisabled ? disabledStyle : {}
      }} title={item.disabledReason || ''}>
                    <input type="radio" name={option.name} value={item.id} checked={isChecked} disabled={isDisabled} onChange={() => !isDisabled && handleRadioChange(option.name, item.id)} style={{
        display: 'none'
      }} />
                    {item.label}
                    {item.subtitle && <small style={{
        ...subtitleStyle,
        color: isChecked ? 'rgba(255,255,255,0.85)' : 'inherit'
      }}>{item.subtitle}</small>}
                  </label>;
    })}
            </div>
          </div>;
  })}
      <div style={cardStyle}>
        <div style={titleStyle}>Run this Command:</div>
        <pre style={commandDisplayStyle}>{generateCommand()}</pre>
      </div>
    </div>;
};

## 1. Model Introduction

[Kimi-K2.5](https://huggingface.co/moonshotai/Kimi-K2.5) is an open-source, native multimodal agentic model by Moonshot AI, built through continual pretraining on approximately 15 trillion mixed visual and text tokens atop Kimi-K2-Base. It seamlessly integrates vision and language understanding with advanced agentic capabilities, and supports both instant and thinking modes.

**Key Features:**

* **Native Multimodality**: Pre-trained on vision-language tokens, K2.5 excels in visual knowledge, cross-modal reasoning, and agentic tool use grounded in visual inputs.
* **Coding with Vision**: K2.5 generates code from visual specifications (UI designs, video workflows) and autonomously orchestrates tools for visual data processing.
* **Agent Swarm**: K2.5 transitions from single-agent scaling to a self-directed, coordinated swarm-like execution scheme. It decomposes complex tasks into parallel sub-tasks executed by dynamically instantiated, domain-specific agents.
* **Speculative Decoding**: EAGLE-based speculative decoding support for lower latency.

**Available Models**:

* INT4 (initial release): [moonshotai/Kimi-K2.5](https://huggingface.co/moonshotai/Kimi-K2.5)
* NVFP4 (4-bit quantized): [nvidia/Kimi-K2.5-NVFP4](https://huggingface.co/nvidia/Kimi-K2.5-NVFP4)

For details, see [official documentation](https://huggingface.co/moonshotai/Kimi-K2.5) and [deployment guidance](https://huggingface.co/moonshotai/Kimi-K2.5/blob/main/docs/deploy_guidance.md).

## 2. SGLang Installation

Refer to the [official SGLang installation guide](../../../docs/get-started/install).

## 3. Model Deployment

### 3.1 Basic Configuration

**Interactive Command Generator**: Use the configuration selector below to automatically generate the appropriate deployment command for your hardware platform, deployment strategy, and capabilities.

<KimiK25Deployment />

### 3.2 Configuration Tips

* **Memory**: Requires GPUs with >=140GB each. Supported platforms: H200 (8x, TP=8), B300 (8x, TP=8), MI300X/MI325X (4x, TP=4), MI350X/MI355X (4x, TP=4). Use `--context-length 128000` to conserve memory.
* **AMD GPU TP Constraint**: On AMD GPUs, TP must be \<= 4 (not 8). Kimi-K2.5 has 64 attention heads; the AITER MLA kernel requires `heads_per_gpu % 16 == 0`. With TP=4, each GPU gets 16 heads (valid). With TP=8, each GPU gets 8 heads (invalid).
* **AMD Docker Image**: Use `lmsysorg/sglang:v0.5.9-rocm700-mi35x` for MI350X/MI355X and `lmsysorg/sglang:v0.5.9-rocm700-mi30x` for MI300X/MI325X. The ROCm 7.2 images (`rocm720`) have an AITER compatibility issue.
* **DP Attention**: Enable with `--dp <N> --enable-dp-attention` for production throughput. A common choice is to set `--dp` equal to `--tp`, but this is not required.
* **Reasoning Parser**: Add `--reasoning-parser kimi_k2` to separate thinking and content in model outputs.
* **Tool Call Parser**: Add `--tool-call-parser kimi_k2` for structured tool calls.

## 4. Model Invocation

### 4.1 Basic Usage

See [Basic API Usage](../../../docs/basic_usage/send_request).

### 4.2 Advanced Usage

#### 4.2.1 Multimodal (Vision + Text) Input

Kimi-K2.5 supports native multimodal input with images:

```python Example theme={null}
from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:30000/v1",
    api_key="EMPTY"
)

response = client.chat.completions.create(
    model="moonshotai/Kimi-K2.5",
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "image_url",
                    "image_url": {
                        "url": "https://ofasys-multimodal-wlcb-3-toshanghai.oss-accelerate.aliyuncs.com/wpf272043/keepme/image/receipt.png"
                    }
                },
                {
                    "type": "text",
                    "text": "What is in this image? Describe it in detail."
                }
            ]
        }
    ]
)

print(response.choices[0].message.content)
```

**Output Example:**

```text Output theme={null}
This image shows a **receipt from Auntie Anne's** (a pretzel franchise restaurant).

## Key Details:

**Item Purchased:**
- **CINNAMON SUGAR** - 1 unit x 17,000 = **17,000**

**Payment Summary:**
- **SUB TOTAL:** 17,000
- **GRAND TOTAL:** 17,000
- **CASH IDR:** 20,000 (Indonesian Rupiah)
- **CHANGE DUE:** 3,000

## Context:
The receipt indicates a transaction in **Indonesian Rupiah (IDR)**. A customer purchased one Cinnamon Sugar pretzel for 17,000 IDR, paid with a 20,000 IDR note, and received 3,000 IDR in change.

The top of the receipt shows the Auntie Anne's logo (a heart-shaped pretzel with a halo), and some text appears blurred for privacy, likely obscuring the store location, date, and transaction number. The receipt is printed on white thermal paper.
```

#### 4.2.2 Reasoning Output

Kimi-K2.5 supports both thinking mode (default) and instant mode.

**Thinking Mode (default)** -- reasoning content is returned separately in `reasoning_content` when the server is launched with `--reasoning-parser kimi_k2`:

```python Example theme={null}
from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:30000/v1",
    api_key="EMPTY"
)

response = client.chat.completions.create(
    model="moonshotai/Kimi-K2.5",
    messages=[
        {"role": "user", "content": "Which one is bigger, 9.11 or 9.9? Think carefully."}
    ]
)

print("====== Reasoning Content (Thinking Mode) ======")
print(response.choices[0].message.reasoning_content)
print("====== Response (Thinking Mode) ======")
print(response.choices[0].message.content)
```

**Instant Mode (thinking off)** -- disable thinking for faster responses:

```python Example theme={null}
response = client.chat.completions.create(
    model="moonshotai/Kimi-K2.5",
    messages=[
        {"role": "user", "content": "Which one is bigger, 9.11 or 9.9? Think carefully."}
    ],
    extra_body={"chat_template_kwargs": {"thinking": False}}
)

print("====== Response (Instant Mode) ======")
print(response.choices[0].message.content)
```

**Output Example:**

```text Output theme={null}
====== Reasoning Content (Thinking Mode) ======
The user is asking which number is bigger: 9.11 or 9.9.

At first glance, someone might think 9.11 is bigger because 11 > 9, but that's incorrect because we're dealing with decimal numbers, not whole numbers.

Let me compare them properly:
- 9.9 = 9.90
- 9.11

When comparing decimals, we look at each place value from left to right:
- Units place: 9 = 9 (tie)
- Tenths place: 9 vs 1

Since 9 > 1, we have 9.9 > 9.11.

Alternatively, we can think of it as:
- 9.9 = 9 + 9/10 = 9 + 0.9 = 9.90
- 9.11 = 9 + 11/100 = 9 + 0.11

Since 0.90 > 0.11, then 9.9 > 9.11.

So the answer is clearly 9.9 is bigger.

The "think carefully" hint suggests the user is trying to catch the common error where people compare 11 and 9 as whole numbers rather than understanding decimal place value (tenths vs hundredths).

I should explain this clearly to avoid confusion.
====== Response (Thinking Mode) ======
**9.9 is bigger.**

Here's why this can be tricky: Many people instinctively compare 11 and 9 and think "11 is bigger than 9," but that's comparing the wrong place values.

When comparing decimals, align them by place value:
- 9.9 = 9.**90**
- 9.11 = 9.**11**

After the decimal point:
- The first digit (tenths place): **9** vs **1**
- Since 9 > 1, we stop there. **9.9 is larger.**

Think of it as money:
- $9.90 (nine dollars and ninety cents)
- $9.11 (nine dollars and eleven cents)

$9.90 is clearly more than $9.11.
====== Response (Instant Mode) ======
 Let me think through this carefully.

**9.9 is bigger than 9.11**

Here's why: When comparing decimals, we need to align them by their decimal places:

- 9.9 = 9.90
- 9.11 = 9.11

Now comparing:
- The whole number parts are equal (9 = 9)
- Comparing tenths: **9 > 1**

So 9.90 > 9.11

A common mistake is thinking 11 hundredths is larger than 9 tenths, but 9 tenths = 90 hundredths, which is clearly larger than 11 hundredths.
```

#### 4.2.3 Tool Calling

Kimi-K2.5 supports tool calling capabilities for agentic tasks:

```python Example theme={null}
from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:30000/v1",
    api_key="EMPTY"
)

# Define available tools
tools = [
    {
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Get the current weather for a location",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {
                        "type": "string",
                        "description": "The city name"
                    },
                    "unit": {
                        "type": "string",
                        "enum": ["celsius", "fahrenheit"],
                        "description": "Temperature unit"
                    }
                },
                "required": ["location"]
            }
        }
    }
]

response = client.chat.completions.create(
    model="moonshotai/Kimi-K2.5",
    messages=[
        {"role": "user", "content": "What's the weather in Beijing?"}
    ],
    tools=tools,
    stream=True
)

# Process streaming response
tool_calls_accumulator = {}

for chunk in response:
    if chunk.choices and len(chunk.choices) > 0:
        delta = chunk.choices[0].delta

        if hasattr(delta, 'tool_calls') and delta.tool_calls:
            for tool_call in delta.tool_calls:
                index = tool_call.index
                if index not in tool_calls_accumulator:
                    tool_calls_accumulator[index] = {'name': None, 'arguments': ''}
                if tool_call.function:
                    if tool_call.function.name:
                        tool_calls_accumulator[index]['name'] = tool_call.function.name
                    if tool_call.function.arguments:
                        tool_calls_accumulator[index]['arguments'] += tool_call.function.arguments

        if delta.content:
            print(delta.content, end="", flush=True)

for index, tool_call in sorted(tool_calls_accumulator.items()):
    print(f"Tool Call: {tool_call['name']}")
    print(f"  Arguments: {tool_call['arguments']}")
```

**Output Example:**

```text Output theme={null}
Tool Call: get_weather
  Arguments: {"location": "Beijing"}
```

**Handling Tool Call Results:**

```python Example theme={null}
# Send tool result back to the model
messages = [
    {"role": "user", "content": "What's the weather in Beijing?"},
    {
        "role": "assistant",
        "content": None,
        "tool_calls": [{
            "id": "call_123",
            "type": "function",
            "function": {
                "name": "get_weather",
                "arguments": '{"location": "Beijing", "unit": "celsius"}'
            }
        }]
    },
    {
        "role": "tool",
        "tool_call_id": "call_123",
        "content": "The weather in Beijing is 22°C and sunny."
    }
]

final_response = client.chat.completions.create(
    model="moonshotai/Kimi-K2.5",
    messages=messages
)

print(final_response.choices[0].message.content)
```

**Output Example:**

```text Output theme={null}
The weather in Beijing is **22°C and sunny**. ☀️

It's a nice day there with comfortable temperatures and clear skies!
```

#### 4.2.4 Multimodal + Tool Calling (Agentic Vision)

Combine vision understanding with tool calling for advanced agentic tasks:

```python Example theme={null}
from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:30000/v1",
    api_key="EMPTY"
)

tools = [
    {
        "type": "function",
        "function": {
            "name": "search_product",
            "description": "Search for a product by name or description",
            "parameters": {
                "type": "object",
                "properties": {
                    "query": {
                        "type": "string",
                        "description": "The product name or description to search for"
                    }
                },
                "required": ["query"]
            }
        }
    }
]

response = client.chat.completions.create(
    model="moonshotai/Kimi-K2.5",
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "image_url",
                    "image_url": {
                        "url": "https://ofasys-multimodal-wlcb-3-toshanghai.oss-accelerate.aliyuncs.com/wpf272043/keepme/image/receipt.png"
                    }
                },
                {
                    "type": "text",
                    "text": "Can you identify this product and search for similar items?"
                }
            ]
        }
    ],
    tools=tools
)

msg = response.choices[0].message

# Print reasoning process
if msg.reasoning_content:
    print("=== Reasoning ===")
    print(msg.reasoning_content)

# Print response content
if msg.content:
    print("=== Content ===")
    print(msg.content)

# Print tool calls
if msg.tool_calls:
    print("=== Tool Calls ===")
    for tc in msg.tool_calls:
        print(f"  Function: {tc.function.name}")
        print(f"  Arguments: {tc.function.arguments}")
```

**Output Example:**

```text Output theme={null}
=== Reasoning ===
The user is asking me to identify a product from a receipt and search for similar items.
Looking at the receipt, I can see:

 1. The store is "Auntie Anne's" - which is a popular pretzel chain
 2. The product purchased is "CINNAMON SUGAR"
 3. Price is 17,000 (likely Indonesian Rupiah based on "CASH IDR")
 4. Quantity is 1

So the product is a Cinnamon Sugar pretzel from Auntie Anne's.
Now I need to search for this product or similar items using the search_product function.
=== Content ===
I can see from the receipt that the product is a **Cinnamon Sugar** item from **Auntie Anne's** (the famous pretzel chain). This appears to be a Cinnamon Sugar Pretzel purchased for 17,000 IDR (Indonesian Rupiah).

Let me search for this product and similar items:
=== Tool Calls ===
  Function: search_product
  Arguments: {"query": "Auntie Anne's Cinnamon Sugar Pretzel"}
```

#### 4.2.5 Speculative Decoding

**NVIDIA**

Deploy Kimi-K2.5 with the following command (H200/B200, all features enabled):

```shell Command theme={null}
SGLANG_ENABLE_SPEC_V2=1 sglang serve \
  --model-path moonshotai/Kimi-K2.5 \
  --tp 8 \
  --reasoning-parser kimi_k2 \
  --tool-call-parser kimi_k2 \
  --speculative-algorithm=EAGLE3 \
  --speculative-num-steps 3 \
  --speculative-eagle-topk 1 \
  --speculative-num-draft-tokens 4 \
  --speculative-draft-model-path lightseekorg/kimi-k2.5-eagle3 \
  --trust-remote-code \
  --host 0.0.0.0 \
  --port 30000
```

Deploy Kimi-K2.5-NVFP4 with the following command (B200, all features enabled):

```shell Command theme={null}
SGLANG_ENABLE_SPEC_V2=1 sglang serve \
  --model-path nvidia/Kimi-K2.5-NVFP4 \
  --tp 8 \
  --reasoning-parser kimi_k2 \
  --tool-call-parser kimi_k2 \
  --kv-cache-dtype fp8_e4m3 \
  --speculative-algorithm=EAGLE3 \
  --speculative-num-steps 3 \
  --speculative-eagle-topk 1 \
  --speculative-num-draft-tokens 4 \
  --speculative-draft-model-path lightseekorg/kimi-k2.5-eagle3 \
  --trust-remote-code \
  --host 0.0.0.0 \
  --port 30000
```

## 5. Benchmark

### 5.1 Accuracy Benchmark

#### 5.1.1 MMMU Benchmark

You can evaluate the model's accuracy using the MMMU benchmark, which tests multimodal understanding and reasoning across various subjects:

* **Benchmark Command:**

```shell Command theme={null}
python3 benchmark/mmmu/bench_sglang.py \
    --response-answer-regex "(?i)(?:answer|ans)[:\s]*(?:\*\*)?[\(\[]?([A-Za-z])[\)\]]?(?:\*\*)?" \
    --port 30000 \
    --concurrency 64
```

* **Result:**

```text Output theme={null}
Benchmark time: 2785.4322692090645
answers saved to: ./answer_sglang.json
Evaluating...
answers saved to: ./answer_sglang.json
{'Accounting': {'acc': 0.667, 'num': 30},
 'Agriculture': {'acc': 0.567, 'num': 30},
 'Architecture_and_Engineering': {'acc': 0.733, 'num': 30},
 'Art': {'acc': 0.833, 'num': 30},
 'Art_Theory': {'acc': 0.8, 'num': 30},
 'Basic_Medical_Science': {'acc': 0.833, 'num': 30},
 'Biology': {'acc': 0.6, 'num': 30},
 'Chemistry': {'acc': 0.633, 'num': 30},
 'Clinical_Medicine': {'acc': 0.733, 'num': 30},
 'Computer_Science': {'acc': 0.667, 'num': 30},
 'Design': {'acc': 0.7, 'num': 30},
 'Diagnostics_and_Laboratory_Medicine': {'acc': 0.5, 'num': 30},
 'Economics': {'acc': 0.867, 'num': 30},
 'Electronics': {'acc': 0.3, 'num': 30},
 'Energy_and_Power': {'acc': 0.767, 'num': 30},
 'Finance': {'acc': 0.833, 'num': 30},
 'Geography': {'acc': 0.667, 'num': 30},
 'History': {'acc': 0.767, 'num': 30},
 'Literature': {'acc': 0.767, 'num': 30},
 'Manage': {'acc': 0.733, 'num': 30},
 'Marketing': {'acc': 0.833, 'num': 30},
 'Materials': {'acc': 0.567, 'num': 30},
 'Math': {'acc': 0.633, 'num': 30},
 'Mechanical_Engineering': {'acc': 0.567, 'num': 30},
 'Music': {'acc': 0.5, 'num': 30},
 'Overall': {'acc': 0.698, 'num': 900},
 'Overall-Art and Design': {'acc': 0.708, 'num': 120},
 'Overall-Business': {'acc': 0.787, 'num': 150},
 'Overall-Health and Medicine': {'acc': 0.74, 'num': 150},
 'Overall-Humanities and Social Science': {'acc': 0.75, 'num': 120},
 'Overall-Science': {'acc': 0.66, 'num': 150},
 'Overall-Tech and Engineering': {'acc': 0.595, 'num': 210},
 'Pharmacy': {'acc': 0.767, 'num': 30},
 'Physics': {'acc': 0.767, 'num': 30},
 'Psychology': {'acc': 0.667, 'num': 30},
 'Public_Health': {'acc': 0.867, 'num': 30},
 'Sociology': {'acc': 0.8, 'num': 30}}
eval out saved to ./val_sglang.json
Overall accuracy: 0.698
```

### 5.2 Speed Benchmark

**Test Environment:**

* Hardware: NVIDIA H200 GPU (8x)
* Model: Kimi-K2.5
* Tensor Parallelism: 8
* SGLang Version: 0.5.6.post2

We use SGLang's built-in benchmarking tool with the `random` dataset for standardized performance evaluation.

#### 5.2.1 Latency Benchmark

* **Model Deployment:**

```bash Command theme={null}
sglang serve \
  --model-path moonshotai/Kimi-K2.5 \
  --tp 8 \
  --trust-remote-code \
  --host 0.0.0.0 \
  --port 30000
```

* **Benchmark Command:**

```bash Command theme={null}
python3 -m sglang.bench_serving \
  --backend sglang \
  --model moonshotai/Kimi-K2.5 \
  --dataset-name random \
  --random-input-len 1000 \
  --random-output-len 1000 \
  --num-prompts 10 \
  --max-concurrency 1 \
  --request-rate inf
```

* **Results:**

```text Output theme={null}
============ Serving Benchmark Result ============
Backend:                                 sglang
Traffic request rate:                    inf
Max request concurrency:                 1
Successful requests:                     10
Benchmark duration (s):                  39.77
Total input tokens:                      6101
Total input text tokens:                 6101
Total generated tokens:                  4220
Total generated tokens (retokenized):    4221
Request throughput (req/s):              0.25
Input token throughput (tok/s):          153.40
Output token throughput (tok/s):         106.10
Peak output token throughput (tok/s):    156.00
Peak concurrent requests:                2
Total token throughput (tok/s):          259.50
Concurrency:                             1.00
----------------End-to-End Latency----------------
Mean E2E Latency (ms):                   3972.87
Median E2E Latency (ms):                 4044.55
P90 E2E Latency (ms):                    7046.30
P99 E2E Latency (ms):                    7441.13
---------------Time to First Token----------------
Mean TTFT (ms):                          176.89
Median TTFT (ms):                        154.24
P99 TTFT (ms):                           285.75
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms):                          9.22
Median TPOT (ms):                        9.32
P99 TPOT (ms):                           12.72
---------------Inter-Token Latency----------------
Mean ITL (ms):                           9.02
Median ITL (ms):                         8.80
P95 ITL (ms):                            13.23
P99 ITL (ms):                            14.17
Max ITL (ms):                            29.38
==================================================
```

* Medium Concurrency (Balanced)

```bash Command theme={null}
python -m sglang.bench_serving \
  --backend sglang \
  --model moonshotai/Kimi-K2.5 \
  --dataset-name random \
  --random-input-len 1000 \
  --random-output-len 1000 \
  --num-prompts 80 \
  --max-concurrency 16 \
  --request-rate inf
```

```text Output theme={null}
============ Serving Benchmark Result ============
Backend:                                 sglang
Traffic request rate:                    inf
Max request concurrency:                 16
Successful requests:                     80
Benchmark duration (s):                  158.05
Total input tokens:                      39668
Total input text tokens:                 39668
Total generated tokens:                  40805
Total generated tokens (retokenized):    40775
Request throughput (req/s):              0.51
Input token throughput (tok/s):          250.99
Output token throughput (tok/s):         258.18
Peak output token throughput (tok/s):    1103.00
Peak concurrent requests:                19
Total token throughput (tok/s):          509.17
Concurrency:                             14.09
----------------End-to-End Latency----------------
Mean E2E Latency (ms):                   27837.05
Median E2E Latency (ms):                 23508.00
P90 E2E Latency (ms):                    57126.31
P99 E2E Latency (ms):                    66044.35
---------------Time to First Token----------------
Mean TTFT (ms):                          374.30
Median TTFT (ms):                        375.51
P99 TTFT (ms):                           695.58
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms):                          53.25
Median TPOT (ms):                        57.93
P99 TPOT (ms):                           85.45
---------------Inter-Token Latency----------------
Mean ITL (ms):                           53.95
Median ITL (ms):                         53.97
P95 ITL (ms):                            84.74
P99 ITL (ms):                            244.84
Max ITL (ms):                            655.61
==================================================
```

* High Concurrency (Throughput-Optimized)

```bash Command theme={null}
python3 -m sglang.bench_serving \
  --backend sglang \
  --model moonshotai/Kimi-K2.5 \
  --dataset-name random \
  --random-input-len 1000 \
  --random-output-len 1000 \
  --num-prompts 500 \
  --max-concurrency 100 \
  --request-rate inf
```

* **Results:**

```text Output theme={null}
============ Serving Benchmark Result ============
Backend:                                 sglang
Traffic request rate:                    inf
Max request concurrency:                 100
Successful requests:                     500
Benchmark duration (s):                  996.64
Total input tokens:                      249831
Total input text tokens:                 249831
Total generated tokens:                  252662
Total generated tokens (retokenized):    252588
Request throughput (req/s):              0.50
Input token throughput (tok/s):          250.67
Output token throughput (tok/s):         253.51
Peak output token throughput (tok/s):    1199.00
Peak concurrent requests:                104
Total token throughput (tok/s):          504.18
Concurrency:                             92.70
----------------End-to-End Latency----------------
Mean E2E Latency (ms):                   184773.75
Median E2E Latency (ms):                 174183.65
P90 E2E Latency (ms):                    343625.28
P99 E2E Latency (ms):                    404284.53
---------------Time to First Token----------------
Mean TTFT (ms):                          1289.59
Median TTFT (ms):                        1313.35
P99 TTFT (ms):                           2346.78
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms):                          364.70
Median TPOT (ms):                        403.32
P99 TPOT (ms):                           452.34
---------------Inter-Token Latency----------------
Mean ITL (ms):                           363.82
Median ITL (ms):                         316.21
P95 ITL (ms):                            745.91
P99 ITL (ms):                            1345.88
Max ITL (ms):                            3118.59
==================================================
```

**Scenario 2: Reasoning (1K/8K)**

* Low Concurrency

```bash Command theme={null}
python -m sglang.bench_serving \
  --backend sglang \
  --model moonshotai/Kimi-K2.5 \
  --dataset-name random \
  --random-input-len 1000 \
  --random-output-len 8000 \
  --num-prompts 10 \
  --max-concurrency 1 \
  --request-rate inf
```

```text Output theme={null}
============ Serving Benchmark Result ============
Backend:                                 sglang
Traffic request rate:                    inf
Max request concurrency:                 1
Successful requests:                     10
Benchmark duration (s):                  680.26
Total input tokens:                      6101
Total input text tokens:                 6101
Total generated tokens:                  44462
Total generated tokens (retokenized):    44455
Request throughput (req/s):              0.01
Input token throughput (tok/s):          8.97
Output token throughput (tok/s):         65.36
Peak output token throughput (tok/s):    151.00
Peak concurrent requests:                2
Total token throughput (tok/s):          74.33
Concurrency:                             1.00
----------------End-to-End Latency----------------
Mean E2E Latency (ms):                   68019.29
Median E2E Latency (ms):                 70568.85
P90 E2E Latency (ms):                    113237.40
P99 E2E Latency (ms):                    121682.34
---------------Time to First Token----------------
Mean TTFT (ms):                          206.17
Median TTFT (ms):                        177.28
P99 TTFT (ms):                           445.37
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms):                          14.36
Median TPOT (ms):                        15.89
P99 TPOT (ms):                           16.43
---------------Inter-Token Latency----------------
Mean ITL (ms):                           15.26
Median ITL (ms):                         15.85
P95 ITL (ms):                            17.50
P99 ITL (ms):                            23.21
Max ITL (ms):                            45.22
==================================================
```

* Medium Concurrency

```bash Command theme={null}
python -m sglang.bench_serving \
  --backend sglang \
  --model moonshotai/Kimi-K2.5 \
  --dataset-name random \
  --random-input-len 1000 \
  --random-output-len 8000 \
  --num-prompts 80 \
  --max-concurrency 16 \
  --request-rate inf
```

```text Output theme={null}
============ Serving Benchmark Result ============
Backend:                                 sglang
Traffic request rate:                    inf
Max request concurrency:                 16
Successful requests:                     80
Benchmark duration (s):                  2475.98
Total input tokens:                      39668
Total input text tokens:                 39668
Total generated tokens:                  318306
Total generated tokens (retokenized):    318166
Request throughput (req/s):              0.03
Input token throughput (tok/s):          16.02
Output token throughput (tok/s):         128.56
Peak output token throughput (tok/s):    847.00
Peak concurrent requests:                18
Total token throughput (tok/s):          144.58
Concurrency:                             14.62
----------------End-to-End Latency----------------
Mean E2E Latency (ms):                   452592.46
Median E2E Latency (ms):                 486002.05
P90 E2E Latency (ms):                    833197.57
P99 E2E Latency (ms):                    957399.48
---------------Time to First Token----------------
Mean TTFT (ms):                          359.38
Median TTFT (ms):                        350.78
P99 TTFT (ms):                           500.36
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms):                          111.18
Median TPOT (ms):                        122.76
P99 TPOT (ms):                           145.90
---------------Inter-Token Latency----------------
Mean ITL (ms):                           113.69
Median ITL (ms):                         122.81
P95 ITL (ms):                            147.87
P99 ITL (ms):                            151.03
Max ITL (ms):                            272.05
==================================================
```

* High Concurrency

```bash Command theme={null}
python -m sglang.bench_serving \
  --backend sglang \
  --model moonshotai/Kimi-K2.5 \
  --dataset-name random \
  --random-input-len 1000 \
  --random-output-len 8000 \
  --num-prompts 320 \
  --max-concurrency 64 \
  --request-rate inf
```

```text Output theme={null}
Pending update...
```

**Scenario 3: Summarization (8K/1K)**

* Low Concurrency

```bash Command theme={null}
python -m sglang.bench_serving \
  --backend sglang \
  --model moonshotai/Kimi-K2.5 \
  --dataset-name random \
  --random-input-len 8000 \
  --random-output-len 1000 \
  --num-prompts 10 \
  --max-concurrency 1 \
  --request-rate inf
```

```text Output theme={null}
============ Serving Benchmark Result ============
Backend:                                 sglang
Traffic request rate:                    inf
Max request concurrency:                 1
Successful requests:                     10
Benchmark duration (s):                  120.73
Total input tokens:                      41941
Total input text tokens:                 41941
Total generated tokens:                  4220
Total generated tokens (retokenized):    4220
Request throughput (req/s):              0.08
Input token throughput (tok/s):          347.41
Output token throughput (tok/s):         34.96
Peak output token throughput (tok/s):    73.00
Peak concurrent requests:                2
Total token throughput (tok/s):          382.36
Concurrency:                             1.00
----------------End-to-End Latency----------------
Mean E2E Latency (ms):                   12068.56
Median E2E Latency (ms):                 10211.36
P90 E2E Latency (ms):                    23203.32
P99 E2E Latency (ms):                    30677.66
---------------Time to First Token----------------
Mean TTFT (ms):                          1625.64
Median TTFT (ms):                        1526.63
P99 TTFT (ms):                           3743.51
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms):                          24.95
Median TPOT (ms):                        23.95
P99 TPOT (ms):                           35.40
---------------Inter-Token Latency----------------
Mean ITL (ms):                           24.80
Median ITL (ms):                         21.73
P95 ITL (ms):                            59.56
P99 ITL (ms):                            61.10
Max ITL (ms):                            62.70
==================================================
```

* Medium Concurrency

```bash Command theme={null}
python -m sglang.bench_serving \
  --backend sglang \
  --model moonshotai/Kimi-K2.5 \
  --dataset-name random \
  --random-input-len 8000 \
  --random-output-len 1000 \
  --num-prompts 80 \
  --max-concurrency 16 \
  --request-rate inf
```

```text Output theme={null}
============ Serving Benchmark Result ============
Backend:                                 sglang
Traffic request rate:                    inf
Max request concurrency:                 16
Successful requests:                     80
Benchmark duration (s):                  389.96
Total input tokens:                      300020
Total input text tokens:                 300020
Total generated tokens:                  41669
Total generated tokens (retokenized):    41670
Request throughput (req/s):              0.21
Input token throughput (tok/s):          769.36
Output token throughput (tok/s):         106.86
Peak output token throughput (tok/s):    304.00
Peak concurrent requests:                19
Total token throughput (tok/s):          876.22
Concurrency:                             14.95
----------------End-to-End Latency----------------
Mean E2E Latency (ms):                   72870.97
Median E2E Latency (ms):                 70495.88
P90 E2E Latency (ms):                    121820.46
P99 E2E Latency (ms):                    148933.09
---------------Time to First Token----------------
Mean TTFT (ms):                          2460.45
Median TTFT (ms):                        1976.29
P99 TTFT (ms):                           7305.53
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms):                          140.57
Median TPOT (ms):                        142.31
P99 TPOT (ms):                           273.40
---------------Inter-Token Latency----------------
Mean ITL (ms):                           135.44
Median ITL (ms):                         95.96
P95 ITL (ms):                            152.93
P99 ITL (ms):                            1488.37
Max ITL (ms):                            6540.24
==================================================
```

* High Concurrency

```bash Command theme={null}
python -m sglang.bench_serving \
  --backend sglang \
  --model moonshotai/Kimi-K2.5 \
  --dataset-name random \
  --random-input-len 8000 \
  --random-output-len 1000 \
  --num-prompts 320 \
  --max-concurrency 64 \
  --request-rate inf
```

```text Output theme={null}
============ Serving Benchmark Result ============
Backend:                                 sglang
Traffic request rate:                    inf
Max request concurrency:                 64
Successful requests:                     320
Benchmark duration (s):                  1279.50
Total input tokens:                      1273893
Total input text tokens:                 1273893
Total generated tokens:                  170000
Total generated tokens (retokenized):    169981
Request throughput (req/s):              0.25
Input token throughput (tok/s):          995.62
Output token throughput (tok/s):         132.86
Peak output token throughput (tok/s):    703.00
Peak concurrent requests:                67
Total token throughput (tok/s):          1128.49
Concurrency:                             60.12
----------------End-to-End Latency----------------
Mean E2E Latency (ms):                   240385.63
Median E2E Latency (ms):                 236266.30
P90 E2E Latency (ms):                    429882.12
P99 E2E Latency (ms):                    515158.36
---------------Time to First Token----------------
Mean TTFT (ms):                          2710.44
Median TTFT (ms):                        2345.63
P99 TTFT (ms):                           7144.20
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms):                          443.84
Median TPOT (ms):                        493.29
P99 TPOT (ms):                           606.19
---------------Inter-Token Latency----------------
Mean ITL (ms):                           448.23
Median ITL (ms):                         296.17
P95 ITL (ms):                            1869.15
P99 ITL (ms):                            2708.95
Max ITL (ms):                            7778.47
==================================================
```

#### 5.2.2 Speculative Decoding Benchmark

* **Model Deployment:**

```bash Command theme={null}
SGLANG_ENABLE_SPEC_V2=1 sglang serve \
  --model-path moonshotai/Kimi-K2.5 \
  --tp 8 \
  --reasoning-parser kimi_k2 \
  --tool-call-parser kimi_k2 \
  --speculative-algorithm=EAGLE3 \
  --speculative-num-steps 3 \
  --speculative-eagle-topk 1 \
  --speculative-num-draft-tokens 4 \
  --speculative-draft-model-path lightseekorg/kimi-k2.5-eagle3 \
  --trust-remote-code \
  --host 0.0.0.0 \
  --port 30000
```

* **Benchmark Command:**

```bash Command theme={null}
python3 -m sglang.bench_serving \
  --backend sglang \
  --model moonshotai/Kimi-K2.5 \
  --dataset-name random \
  --random-input-len 1000 \
  --random-output-len 1000 \
  --num-prompts 10 \
  --max-concurrency 1 \
  --request-rate inf
```

* **Results:**

```text Output theme={null}
Pending update...
```

* Medium Concurrency (Balanced)

```bash Command theme={null}
python -m sglang.bench_serving \
  --backend sglang \
  --model moonshotai/Kimi-K2.5 \
  --dataset-name random \
  --random-input-len 1000 \
  --random-output-len 1000 \
  --num-prompts 80 \
  --max-concurrency 16 \
  --request-rate inf
```

```text Output theme={null}
Pending update...
```

* High Concurrency (Throughput-Optimized)

```bash Command theme={null}
python3 -m sglang.bench_serving \
  --backend sglang \
  --model moonshotai/Kimi-K2.5 \
  --dataset-name random \
  --random-input-len 1000 \
  --random-output-len 1000 \
  --num-prompts 500 \
  --max-concurrency 100 \
  --request-rate inf
```

```text Output theme={null}
Pending update...
```

### 5.3 Speed Benchmark (AMD MI350X)

**Test Environment:**

* Hardware: AMD Instinct MI350X GPU (4x)
* Model: Kimi-K2.5 (BF16)
* Tensor Parallelism: 4
* SGLang Version: 0.5.9
* Docker Image: `lmsysorg/sglang:v0.5.9-rocm700-mi35x`
* ROCm: 7.0

We use SGLang's built-in benchmarking tool with the `random` dataset for standardized performance evaluation.

<Info>
  **AMD GPU TP Constraint:** Kimi-K2.5 requires TP \<= 4 on AMD GPUs. The model has 64 attention heads, and the AITER MLA kernel requires `heads_per_gpu % 16 == 0`, where `heads_per_gpu = 64 / TP`. With TP=4, each GPU gets 16 heads (valid). With TP=8, each GPU gets 8 heads (invalid).
</Info>

#### 5.3.1 Latency Benchmark

* **Model Deployment:**

```bash Command theme={null}
SGLANG_USE_AITER=1 SGLANG_ROCM_FUSED_DECODE_MLA=0 \
sglang serve \
  --model-path moonshotai/Kimi-K2.5 \
  --tp 4 \
  --mem-fraction-static 0.8 \
  --trust-remote-code \
  --reasoning-parser kimi_k2 \
  --host 0.0.0.0 \
  --port 30000
```

* **Benchmark Command:**

```bash Command theme={null}
python3 -m sglang.bench_serving \
  --backend sglang \
  --model moonshotai/Kimi-K2.5 \
  --dataset-name random \
  --random-input-len 1000 \
  --random-output-len 1000 \
  --num-prompts 10 \
  --max-concurrency 1 \
  --request-rate inf
```

* **Results:**

```text Output theme={null}
============ Serving Benchmark Result ============
Backend:                                 sglang
Traffic request rate:                    inf
Max request concurrency:                 1
Successful requests:                     10
Benchmark duration (s):                  155.81
Total input tokens:                      6101
Total input text tokens:                 6101
Total generated tokens:                  4220
Total generated tokens (retokenized):    4222
Request throughput (req/s):              0.06
Input token throughput (tok/s):          39.16
Output token throughput (tok/s):         27.09
Peak output token throughput (tok/s):    29.00
Peak concurrent requests:                2
Total token throughput (tok/s):          66.24
Concurrency:                             1.00
----------------End-to-End Latency----------------
Mean E2E Latency (ms):                   15576.22
Median E2E Latency (ms):                 12539.80
P90 E2E Latency (ms):                    28150.56
P99 E2E Latency (ms):                    34873.51
---------------Time to First Token----------------
Mean TTFT (ms):                          563.50
Median TTFT (ms):                        594.92
P99 TTFT (ms):                           830.31
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms):                          35.61
Median TPOT (ms):                        35.66
P99 TPOT (ms):                           35.77
---------------Inter-Token Latency----------------
Mean ITL (ms):                           35.66
Median ITL (ms):                         35.69
P95 ITL (ms):                            35.96
P99 ITL (ms):                            36.13
Max ITL (ms):                            36.92
==================================================
```

* Medium Concurrency (Balanced)

```bash Command theme={null}
python3 -m sglang.bench_serving \
  --backend sglang \
  --model moonshotai/Kimi-K2.5 \
  --dataset-name random \
  --random-input-len 1000 \
  --random-output-len 1000 \
  --num-prompts 80 \
  --max-concurrency 16 \
  --request-rate inf
```

```text Output theme={null}
============ Serving Benchmark Result ============
Backend:                                 sglang
Traffic request rate:                    inf
Max request concurrency:                 16
Successful requests:                     80
Benchmark duration (s):                  526.66
Total input tokens:                      39668
Total input text tokens:                 39668
Total generated tokens:                  40805
Total generated tokens (retokenized):    40798
Request throughput (req/s):              0.15
Input token throughput (tok/s):          75.32
Output token throughput (tok/s):         77.48
Peak output token throughput (tok/s):    96.00
Peak concurrent requests:                18
Total token throughput (tok/s):          152.80
Concurrency:                             14.59
----------------End-to-End Latency----------------
Mean E2E Latency (ms):                   96023.27
Median E2E Latency (ms):                 93940.20
P90 E2E Latency (ms):                    159449.54
P99 E2E Latency (ms):                    194706.61
---------------Time to First Token----------------
Mean TTFT (ms):                          989.08
Median TTFT (ms):                        886.42
P99 TTFT (ms):                           1543.60
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms):                          191.04
Median TPOT (ms):                        195.20
P99 TPOT (ms):                           238.84
---------------Inter-Token Latency----------------
Mean ITL (ms):                           186.68
Median ITL (ms):                         183.82
P95 ITL (ms):                            189.90
P99 ITL (ms):                            673.64
Max ITL (ms):                            1633.20
==================================================
```
