> ## Documentation Index
> Fetch the complete documentation index at: https://docs.sglang.io/llms.txt
> Use this file to discover all available pages before exploring further.

# Qwen3-Coder

export const Qwen3CoderDeployment = () => {
  // Declarative description of every selector card, rendered in declaration order.
  // Each entry's `name` equals its key and is used as the state key. Choices come from
  // `items`, or from `getDynamicItems(values)` when they depend on other selections.
  // An optional `commandRule(value, values)` returns one extra CLI flag or null.
  const options = {
    hardware: {
      name: 'hardware',
      title: 'Hardware Platform',
      items: [{
        id: 'mi300x',
        label: 'MI300X',
        default: true
      }, {
        id: 'mi325x',
        label: 'MI325X',
        default: false
      }, {
        id: 'mi355x',
        label: 'MI355X',
        default: false
      }, {
        id: 'b200',
        label: 'B200',
        default: false
      }, {
        id: 'gb200',
        label: 'GB200',
        default: false
      }, {
        id: 'xeon',
        label: 'Xeon',
        default: false
      }]
    },
    modelSize: {
      name: 'modelSize',
      title: 'Model Size',
      items: [{
        id: '480b',
        label: '480B',
        subtitle: 'MOE',
        default: true
      }, {
        id: '30b',
        label: '30B',
        subtitle: 'MOE',
        default: false
      }]
    },
    quantization: {
      name: 'quantization',
      title: 'Quantization',
      // NVFP4 is greyed out on every platform for which generateCommand refuses it
      // (anything other than B200/GB200), instead of only on Xeon.
      getDynamicItems: values => {
        const {hardware} = values;
        const isXeon = hardware === 'xeon';
        const isNvidia = hardware === 'b200' || hardware === 'gb200';
        // With no hardware selected at all, keep NVFP4 enabled as before.
        const nvfp4Unavailable = Boolean(hardware) && !isNvidia;
        let nvfp4Reason = '';
        if (isXeon) {
          nvfp4Reason = 'FP4 is not supported on Xeon';
        } else if (nvfp4Unavailable) {
          nvfp4Reason = 'NVFP4 is only available on NVIDIA B200/GB200';
        }
        return [{
          id: 'bf16',
          label: 'BF16',
          default: true
        }, {
          id: 'fp8',
          label: 'FP8',
          default: false,
          disabled: false,
          disabledReason: ''
        }, {
          id: 'nvfp4',
          label: 'NVFP4',
          default: false,
          disabled: nvfp4Unavailable,
          disabledReason: nvfp4Reason
        }];
      }
    },
    toolcall: {
      name: 'toolcall',
      title: 'Tool Call Parser',
      items: [{
        id: 'disabled',
        label: 'Disabled',
        default: true
      }, {
        id: 'enabled',
        label: 'Enabled',
        default: false
      }],
      // Adds the parser flag only when tool calling is switched on.
      commandRule: value => value === 'enabled' ? '--tool-call-parser qwen3_coder' : null
    }
  };
  // Lookup table keyed by model-size id, then by hardware id. `baseName` is the size
  // segment of the model repository name; each hardware entry carries the `tp` value
  // and, where present, the `ep` value. A hardware id missing under a model size has
  // no entry to look up.
  const modelConfigs = (() => {
    // In this table the three MI3xx entries share one `tp` value per model size.
    const onAmd = tp => ({
      mi300x: {tp},
      mi325x: {tp},
      mi355x: {tp}
    });
    const large = {
      baseName: '480B-A35B',
      ...onAmd(8),
      b200: {tp: 8, ep: 8},
      gb200: {tp: 4, ep: 4},
      xeon: {tp: 6}
    };
    const small = {
      baseName: '30B-A3B',
      ...onAmd(1),
      xeon: {tp: 3}
    };
    return {'480b': large, '30b': small};
  })();
  /**
   * Builds the server launch command for the current selection.
   *
   * @param {object} values - Current selections; reads `hardware`, `modelSize` and
   *   `quantization`, and passes every key that has a `commandRule` in `options` to it.
   * @returns {string} A multi-line shell command (flags joined by a backslash-newline),
   *   or a single line starting with `#` explaining why the combination is unavailable.
   */
  const generateCommand = values => {
    const {hardware, modelSize, quantization} = values;
    const isNvidia = hardware === 'b200' || hardware === 'gb200';
    const isXeon = hardware === 'xeon';
    // Every remaining hardware id present in modelConfigs is an AMD MI3xx GPU.
    const isAmd = !isNvidia && !isXeon;
    const isNvfp4 = quantization === 'nvfp4';
    const isFp8 = quantization === 'fp8';
    const modelConfig = modelConfigs[modelSize];
    // An unknown model size is reported like an unknown hardware entry instead of throwing.
    const hwConfig = modelConfig && modelConfig[hardware];
    if (!hwConfig) {
      return `# Configuration not available: ${String(modelSize).toUpperCase()} model has not been verified on ${String(hardware).toUpperCase()}.`;
    }
    if (isNvfp4 && !isNvidia) {
      return `# NVFP4 quantization is only available on NVIDIA B200/GB200 hardware.`;
    }
    if (quantization === 'bf16' && isNvidia) {
      return `# BF16 deployment on ${hardware.toUpperCase()} has not been verified yet. Please use FP8 or NVFP4.`;
    }
    // NVFP4 checkpoints live under the nvidia/ organisation; the suffix is "-NVFP4"
    // (it was previously truncated to "-NVFP").
    let modelName;
    if (isNvfp4) {
      modelName = `nvidia/Qwen3-Coder-${modelConfig.baseName}-Instruct-NVFP4`;
    } else {
      modelName = `Qwen/Qwen3-Coder-${modelConfig.baseName}-Instruct${isFp8 ? '-FP8' : ''}`;
    }
    const args = [`--model ${modelName}`];
    if (isXeon) {
      args.push('--device cpu', '--disable-overlap-schedule');
    }
    args.push(`--tp ${hwConfig.tp}`);
    // Prefer the table's ep value; NVFP4 falls back to 1 when the table has none.
    const ep = hwConfig.ep || (isNvfp4 ? 1 : null);
    if (ep) {
      args.push(`--ep ${ep}`);
    } else if (modelSize === '480b' && isFp8 && !isXeon) {
      args.push('--ep 2');
    }
    if (isNvfp4) {
      args.push('--enable-dp-attention');
      if (isNvidia) {
        args.push('--quantization modelopt_fp4');
      }
    }
    // Let each option contribute its own flag (e.g. the tool-call parser).
    Object.entries(options).forEach(([key, option]) => {
      if (option.commandRule && values[key]) {
        const extraFlag = option.commandRule(values[key], values);
        if (extraFlag) {
          args.push(extraFlag);
        }
      }
    });
    if (isAmd) {
      args.push('--context-length 8192', '--page-size 32');
      if (isFp8) {
        args.push('--trust-remote-code');
      }
    }
    const envPrefix = isAmd ? 'SGLANG_USE_AITER=0 ' : '';
    return envPrefix + ['python -m sglang.launch_server', ...args].join(' \\\n  ');
  };
  // Derives the initial selection for every entry in `options`:
  //   checkbox -> array of ids flagged `default`
  //   text     -> `option.default` or ''
  //   other    -> id of the item flagged `default`, else the first item's id, else ''
  // Options exposing `getDynamicItems` are resolved against the defaults of the
  // options that have static settings.
  const getInitialState = () => {
    const defaultCheckedIds = option => (option.items || []).filter(entry => entry.default).map(entry => entry.id);
    const preferredId = list => {
      const flagged = list && list.find(entry => entry.default);
      if (flagged) {
        return flagged.id;
      }
      return list && list[0] ? list[0].id : '';
    };
    // Snapshot handed to getDynamicItems; options without static items are left out.
    const collectStaticDefaults = () => {
      const snapshot = {};
      for (const [name, candidate] of Object.entries(options)) {
        if (candidate.type === 'checkbox') {
          snapshot[name] = defaultCheckedIds(candidate);
        } else if (candidate.type === 'text') {
          snapshot[name] = candidate.default || '';
        } else if (candidate.items && candidate.items.length > 0) {
          const flagged = candidate.items.find(entry => entry.default);
          snapshot[name] = flagged ? flagged.id : candidate.items[0].id;
        }
      }
      return snapshot;
    };
    return Object.entries(options).reduce((state, [key, option]) => {
      if (option.type === 'checkbox') {
        state[key] = defaultCheckedIds(option);
      } else if (option.type === 'text') {
        state[key] = option.default || '';
      } else {
        const choices = option.getDynamicItems ? option.getDynamicItems(collectStaticDefaults()) : option.items || [];
        state[key] = preferredId(choices);
      }
      return state;
    }, {});
  };
  // `values` holds the current selection per option key; `isDark` mirrors the page theme.
  const [values, setValues] = useState(getInitialState);
  const [isDark, setIsDark] = useState(false);
  // Detect dark mode from the <html> element once on mount, then again whenever its
  // class, data-theme or style attribute changes. The observer is disconnected on unmount.
  useEffect(() => {
    const root = document.documentElement;
    const syncTheme = () => {
      const hasDarkClass = root.classList.contains('dark');
      const hasDarkAttribute = root.getAttribute('data-theme') === 'dark';
      const hasDarkColorScheme = root.style.colorScheme === 'dark';
      setIsDark(hasDarkClass || hasDarkAttribute || hasDarkColorScheme);
    };
    syncTheme();
    const themeObserver = new MutationObserver(syncTheme);
    const watched = {
      attributes: true,
      attributeFilter: ['class', 'data-theme', 'style']
    };
    themeObserver.observe(root, watched);
    return () => themeObserver.disconnect();
  }, []);
  // Stores the chosen item id for a single-choice option, leaving other keys untouched.
  const handleRadioChange = (optionName, value) => {
    const applyChoice = previous => Object.assign({}, previous, {[optionName]: value});
    setValues(applyChoice);
  };
  // Adds `itemId` to (or removes it from) the id list stored for a checkbox option.
  // A missing list is treated as empty.
  const handleCheckboxChange = (optionName, itemId, isChecked) => {
    setValues(previous => {
      const selectedIds = previous[optionName] || [];
      const nextIds = isChecked ? [...selectedIds, itemId] : selectedIds.filter(id => id !== itemId);
      return {
        ...previous,
        [optionName]: nextIds
      };
    });
  };
  // Stores the typed text for a text option, leaving other keys untouched.
  const handleTextChange = (optionName, value) => {
    setValues(previous => {
      const next = {...previous};
      next[optionName] = value;
      return next;
    });
  };
  // Command text for the current selection, recomputed on every render.
  const command = generateCommand(values);
  // All theme-dependent colours in one place; the style objects below only reference it.
  const palette = isDark ? {
    cardBorder: '#374151',
    cardAccent: '#E85D4D',
    cardBackground: '#1f2937',
    text: '#e5e7eb',
    chipBorder: '#9ca3af',
    chipBackground: '#374151',
    inputBorder: '#4b5563',
    inputBackground: '#111827',
    inputText: '#e5e7eb',
    codeBackground: '#111827',
    codeText: '#e5e7eb'
  } : {
    cardBorder: '#e5e7eb',
    cardAccent: '#D45D44',
    cardBackground: '#fff',
    text: 'inherit',
    chipBorder: '#d1d5db',
    chipBackground: '#fff',
    inputBorder: '#d1d5db',
    inputBackground: '#fff',
    inputText: '#111827',
    codeBackground: '#f5f5f5',
    codeText: '#374151'
  };
  // Outer column holding one card per option plus the command card.
  const containerStyle = {
    maxWidth: '900px',
    margin: '0 auto',
    display: 'flex',
    flexDirection: 'column',
    gap: '4px'
  };
  // `borderLeft` is listed after `border` so the accent stripe is applied last.
  const cardStyle = {
    padding: '8px 12px',
    border: `1px solid ${palette.cardBorder}`,
    borderLeft: `3px solid ${palette.cardAccent}`,
    borderRadius: '4px',
    display: 'flex',
    alignItems: 'center',
    gap: '12px',
    background: palette.cardBackground
  };
  const titleStyle = {
    fontSize: '13px',
    fontWeight: '600',
    minWidth: '140px',
    flexShrink: 0,
    color: palette.text
  };
  const itemsStyle = {
    display: 'flex',
    rowGap: '2px',
    columnGap: '6px',
    flexWrap: 'wrap',
    alignItems: 'center',
    flex: 1
  };
  // Base look of a selectable chip; checkedStyle / disabledStyle are layered on top.
  const labelBaseStyle = {
    padding: '4px 10px',
    border: `1px solid ${palette.chipBorder}`,
    borderRadius: '3px',
    cursor: 'pointer',
    display: 'inline-flex',
    flexDirection: 'column',
    alignItems: 'center',
    justifyContent: 'center',
    fontWeight: '500',
    fontSize: '13px',
    transition: 'all 0.2s',
    userSelect: 'none',
    minWidth: '45px',
    textAlign: 'center',
    flex: 1,
    background: palette.chipBackground,
    color: palette.text
  };
  const checkedStyle = {
    background: '#D45D44',
    color: 'white',
    borderColor: '#D45D44'
  };
  const disabledStyle = {
    cursor: 'not-allowed',
    opacity: 0.5
  };
  const subtitleStyle = {
    display: 'block',
    fontSize: '9px',
    marginTop: '1px',
    lineHeight: '1.1',
    opacity: 0.7
  };
  const textInputStyle = {
    flex: 1,
    padding: '8px 10px',
    borderRadius: '4px',
    border: `1px solid ${palette.inputBorder}`,
    background: palette.inputBackground,
    color: palette.inputText,
    fontSize: '13px'
  };
  const commandDisplayStyle = {
    flex: 1,
    padding: '12px 16px',
    background: palette.codeBackground,
    borderRadius: '6px',
    fontFamily: "'Menlo', 'Monaco', 'Courier New', monospace",
    fontSize: '12px',
    lineHeight: '1.5',
    color: palette.codeText,
    whiteSpace: 'pre-wrap',
    overflowX: 'auto',
    margin: 0,
    border: `1px solid ${palette.cardBorder}`
  };
  return <div style={containerStyle} className="not-prose">
      {Object.entries(options).map(([key, option]) => {
    if (option.condition && !option.condition(values)) {
      return null;
    }
    const items = option.getDynamicItems ? option.getDynamicItems(values) : option.items || [];
    return <div key={key} style={cardStyle}>
            <div style={titleStyle}>{option.title}</div>
            <div style={itemsStyle}>
              {option.type === 'text' ? <input type="text" value={values[option.name] || ''} placeholder={option.placeholder || ''} onChange={event => handleTextChange(option.name, event.target.value)} style={textInputStyle} /> : option.type === 'checkbox' ? (option.items || []).map(item => {
      const isChecked = (values[option.name] || []).includes(item.id);
      const isDisabled = item.required || typeof item.disabledWhen === 'function' && item.disabledWhen(values);
      return <label key={item.id} title={item.disabledReason || ''} style={{
        ...labelBaseStyle,
        ...isChecked ? checkedStyle : {},
        ...isDisabled ? disabledStyle : {}
      }}>
                      <input type="checkbox" checked={isChecked} disabled={isDisabled} onChange={event => handleCheckboxChange(option.name, item.id, event.target.checked)} style={{
        display: 'none'
      }} />
                      {item.label}
                      {item.subtitle && <small style={{
        ...subtitleStyle,
        color: isChecked ? 'rgba(255,255,255,0.85)' : 'inherit'
      }}>
                          {item.subtitle}
                        </small>}
                    </label>;
    }) : items.map(item => {
      const isChecked = values[option.name] === item.id;
      const isDisabled = Boolean(item.disabled);
      return <label key={item.id} title={item.disabledReason || ''} style={{
        ...labelBaseStyle,
        ...isChecked ? checkedStyle : {},
        ...isDisabled ? disabledStyle : {}
      }}>
                      <input type="radio" name={option.name} value={item.id} checked={isChecked} disabled={isDisabled} onChange={() => !isDisabled && handleRadioChange(option.name, item.id)} style={{
        display: 'none'
      }} />
                      {item.label}
                      {item.subtitle && <small style={{
        ...subtitleStyle,
        color: isChecked ? 'rgba(255,255,255,0.85)' : 'inherit'
      }}>
                          {item.subtitle}
                        </small>}
                    </label>;
    })}
            </div>
          </div>;
  })}
      <div style={cardStyle}>
        <div style={titleStyle}>Run this Command:</div>
        <pre style={commandDisplayStyle}>{command}</pre>
      </div>
    </div>;
};

## 1. Model Introduction

[Qwen3-Coder](https://huggingface.co/collections/Qwen/qwen3-coder) is the latest code-focused large language model series from the Qwen team. Built on the foundation of Qwen3, Qwen3-Coder delivers exceptional performance in code generation, understanding, and reasoning tasks.

**Key Features:**

* **State-of-the-art Coding Performance**: Achieves top-tier results on HumanEval, MBPP, LiveCodeBench, and other major coding benchmarks.
* **Tool Calling Support**: Native support for function calling and tool use, enabling seamless integration with external APIs and services.
* **Extended Context Length**: Supports up to 256K tokens for processing large codebases and long documents.
* **Multilingual Code Support**: Proficient in Python, JavaScript, TypeScript, Java, C++, Go, Rust, and many other programming languages.
* **MoE Architecture**: Efficient Mixture-of-Experts design for optimal performance-to-cost ratio.
* **ROCm Support**: Compatible with AMD MI300X, MI325X and MI355X GPUs via SGLang (verified).
* **NVIDIA GPU Support**: Compatible with NVIDIA GB200 and B200 GPUs via SGLang (verified).

For more details, please refer to the [official Qwen3-Coder GitHub Repository](https://github.com/QwenLM/Qwen3-Coder).

## 2. SGLang Installation

SGLang offers multiple installation methods. You can choose the most suitable installation method based on your hardware platform and requirements.

Please refer to the [official SGLang installation guide](../../../docs/get-started/install) for installation instructions.

For SGLang CPU installation, please refer to the [CPU version installation guide](../../../docs/hardware-platforms/cpu_server#installation).

## 3. Model Deployment

This section provides deployment configurations verified on AMD MI300X, MI325X, MI355X, NVIDIA B200, GB200, and Intel Xeon CPU hardware platforms.

### 3.1 Configuration

**Interactive Command Generator**: Use the configuration selector below to automatically generate the appropriate deployment command for your hardware platform, model size, and quantization method.

<Qwen3CoderDeployment />

### 3.2 Configuration Tips

**AMD (MI300X/MI325X/MI355X):**

* **Memory Management**: We have verified successful deployment on MI300X/MI325X/MI355X with `--context-length 8192`. Larger context lengths may be supported but require additional memory.
* **Expert Parallelism**: For 480B-A35B with FP8 quantization, `--ep 2` is required to satisfy the dimension alignment requirement.
* **Page Size**: `--page-size 32` is recommended for MoE models to optimize memory usage.
* **Environment Variable**: If you encounter aiter-related issues, try setting `SGLANG_USE_AITER=0`.

**NVIDIA (B200/GB200):**

* **GB200 Parallelism**: Use `--tp 4 --ep 4` on GB200. B200 uses the default NVIDIA settings generated above.
* **NVFP4 Quantization**: Requires `--quantization modelopt_fp4` and uses a different model path (`nvidia/Qwen3-Coder-...`).
* **DP Attention**: NVFP4 configuration supports `--enable-dp-attention` for improved throughput.

**Intel Xeon CPU:**

* Please refer to the `Notes` part in the serving engine launching section in [the SGLang CPU server document](../../../docs/hardware-platforms/cpu_server#launch-of-the-serving-engine) to better understand how to configure the arguments, especially for TP (tensor parallel) and NUMA binding settings.

**General:**

* **Tool Use**: To enable tool calling capabilities, add `--tool-call-parser qwen3_coder` to the launch command.

## 4. Model Invocation

### 4.1 Basic Usage

For basic API usage and request examples, please refer to:

* [SGLang Basic Usage Guide](../../../docs/basic_usage/send_request)

### 4.2 Advanced Usage

#### 4.2.1 Code Generation Example

```python Example theme={null}
from openai import OpenAI

client = OpenAI(
    api_key="EMPTY",
    base_url="http://localhost:30000/v1",
    timeout=3600
)

messages = [
    {
        "role": "user",
        "content": "Write a Python function that implements binary search on a sorted list. Include docstring and type hints."
    }
]

response = client.chat.completions.create(
    model="Qwen/Qwen3-Coder-480B-A35B-Instruct",
    messages=messages,
    max_tokens=2048,
    temperature=0.7
)

print(response.choices[0].message.content)
```

**Example Output:**

````text Output theme={null}
```python
from typing import List, Optional, TypeVar

T = TypeVar('T')

def binary_search(arr: List[T], target: T) -> Optional[int]:
    """
    Perform binary search on a sorted list to find the index of a target element.

    This function implements the binary search algorithm, which efficiently finds
    a target value in a sorted array by repeatedly dividing the search interval
    in half.

    Args:
        arr (List[T]): A sorted list of elements to search through.
        target (T): The element to search for in the list.

    Returns:
        Optional[int]: The index of the target element if found, None otherwise.

    Time Complexity:
        O(log n) where n is the number of elements in the array.

    Space Complexity:
        O(1) - iterative implementation uses constant extra space.

    Examples:
        >>> binary_search([1, 2, 3, 4, 5], 3)
        2
        >>> binary_search([1, 2, 3, 4, 5], 6)
        None
        >>> binary_search(['a', 'b', 'c', 'd'], 'b')
        1
        >>> binary_search([], 1)
        None
    """
    if not arr:
        return None

    left: int = 0
    right: int = len(arr) - 1

    while left <= right:
        mid: int = (left + right) // 2

        if arr[mid] == target:
            return mid
        elif arr[mid] < target:
            left = mid + 1
        else:
            right = mid - 1

    return None

# Alternative recursive implementation
def binary_search_recursive(arr: List[T], target: T, left: int = 0, right: Optional[int] = None) -> Optional[int]:
    """
    Perform binary search recursively on a sorted list to find the index of a target element.

    Args:
        arr (List[T]): A sorted list of elements to search through.
        target (T): The element to search for in the list.
        left (int): Left boundary of the search range (inclusive).
        right (Optional[int]): Right boundary of the search range (inclusive).

    Returns:
        Optional[int]: The index of the target element if found, None otherwise.

    Time Complexity:
        O(log n) where n is the number of elements in the array.

    Space Complexity:
        O(log n) due to recursive call stack.

    Examples:
        >>> binary_search_recursive([1, 2, 3, 4, 5], 3)
        2
        >>> binary_search_recursive([1, 2, 3, 4, 5], 6)
        None
    """
    if not arr:
        return None

    if right is None:
        right = len(arr) - 1

    if left > right:
        return None

    mid: int = (left + right) // 2

    if arr[mid] == target:
        return mid
    elif arr[mid] < target:
        return binary_search_recursive(arr, target, mid + 1, right)
    else:
        return binary_search_recursive(arr, target, left, mid - 1)
```

This implementation provides:

1. **Main function** (`binary_search`): An iterative implementation that's more memory-efficient
2. **Alternative function** (`binary_search_recursive`): A recursive implementation for educational purposes
3. **Type hints**: Using generics (`TypeVar`) to work with any comparable type
4. **Comprehensive docstring**: Including description, parameters, return value, complexity analysis, and examples
5. **Edge case handling**: Empty lists, elements not found, etc.
6. **Clear variable names**: Self-documenting code
7. **Examples**: Doctest-style examples in the docstring

The function works with any sorted list of comparable elements (integers, strings, etc.) and returns the index of the target element if found, or `None` if not found.
````

#### 4.2.2 Tool Calling Example

Qwen3-Coder supports tool calling capabilities. Enable the tool call parser during deployment. The following example uses the 30B-A3B model:

```shell Command theme={null}
SGLANG_USE_AITER=0 python -m sglang.launch_server \
  --model Qwen/Qwen3-Coder-30B-A3B-Instruct \
  --tp 1 \
  --context-length 8192 \
  --page-size 32 \
  --tool-call-parser qwen3_coder
```

**Python Example:**

```python Example theme={null}
from openai import OpenAI

client = OpenAI(
    api_key="EMPTY",
    base_url="http://localhost:30000/v1",
    timeout=3600
)

# Define available tools
tools = [
    {
        "type": "function",
        "function": {
            "name": "execute_code",
            "description": "Execute Python code and return the result",
            "parameters": {
                "type": "object",
                "properties": {
                    "code": {
                        "type": "string",
                        "description": "The Python code to execute"
                    }
                },
                "required": ["code"]
            }
        }
    }
]

response = client.chat.completions.create(
    model="Qwen/Qwen3-Coder-30B-A3B-Instruct",
    messages=[
        {"role": "user", "content": "Calculate the factorial of 10 using Python"}
    ],
    tools=tools,
    temperature=0.7
)

# Check if the model wants to call a tool
if response.choices[0].message.tool_calls:
    tool_call = response.choices[0].message.tool_calls[0]
    print(f"Tool: {tool_call.function.name}")
    print(f"Arguments: {tool_call.function.arguments}")
else:
    # Model may return tool call in content format
    print(response.choices[0].message.content)
```

**Example Output:**

```text Output theme={null}
Tool: execute_code
Arguments: {"code": "def factorial(n):\n    if n == 0 or n == 1:\n        return 1\n    else:\n        return n * factorial(n-1)\n\nresult = factorial(10)\nresult"}
```

## 5. Benchmark

### 5.1 Speed Benchmark

**Test Environment:**

* Hardware: AMD MI300X GPU (8x)
* Model: Qwen/Qwen3-Coder-480B-A35B-Instruct-FP8
* Tensor Parallelism: 8
* Expert Parallelism: 2
* sglang version: 0.5.7

We use SGLang's built-in benchmarking tool to conduct performance evaluation with a random dataset.

#### 5.1.1 AMD Standard Scenario Benchmark

* Model Deployment Command:

```shell Command theme={null}
SGLANG_USE_AITER=0 python -m sglang.launch_server \
  --model Qwen/Qwen3-Coder-480B-A35B-Instruct-FP8 \
  --tp 8 \
  --ep 2 \
  --context-length 8192 \
  --page-size 32 \
  --trust-remote-code
```

##### 5.1.1.1 Low Concurrency

* Benchmark Command:

```shell Command theme={null}
python3 -m sglang.bench_serving \
  --backend sglang \
  --model Qwen/Qwen3-Coder-480B-A35B-Instruct-FP8 \
  --dataset-name random \
  --random-input-len 1000 \
  --random-output-len 1000 \
  --num-prompts 10 \
  --max-concurrency 1
```

* Test Results:

```text Output theme={null}
============ Serving Benchmark Result ============
Backend:                                 sglang
Traffic request rate:                    inf
Max request concurrency:                 1
Successful requests:                     10
Benchmark duration (s):                  73.79
Total input tokens:                      6101
Total input text tokens:                 6101
Total generated tokens:                  4220
Total generated tokens (retokenized):    4104
Request throughput (req/s):              0.14
Input token throughput (tok/s):          82.68
Output token throughput (tok/s):         57.19
Peak output token throughput (tok/s):    59.00
Peak concurrent requests:                2
Total token throughput (tok/s):          139.86
Concurrency:                             1.00
----------------End-to-End Latency----------------
Mean E2E Latency (ms):                   7376.26
Median E2E Latency (ms):                 5851.51
P90 E2E Latency (ms):                    13351.89
P99 E2E Latency (ms):                    16908.32
---------------Time to First Token----------------
Mean TTFT (ms):                          191.93
Median TTFT (ms):                        126.06
P99 TTFT (ms):                           662.15
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms):                          17.06
Median TPOT (ms):                        17.07
P99 TPOT (ms):                           17.08
---------------Inter-Token Latency----------------
Mean ITL (ms):                           17.06
Median ITL (ms):                         17.06
P95 ITL (ms):                            17.14
P99 ITL (ms):                            17.19
Max ITL (ms):                            18.53
==================================================
```

##### 5.1.1.2 Medium Concurrency

* Benchmark Command:

```shell Command theme={null}
python3 -m sglang.bench_serving \
  --backend sglang \
  --model Qwen/Qwen3-Coder-480B-A35B-Instruct-FP8 \
  --dataset-name random \
  --random-input-len 1000 \
  --random-output-len 1000 \
  --num-prompts 80 \
  --max-concurrency 16
```

* Test Results:

```text Output theme={null}
============ Serving Benchmark Result ============
Backend:                                 sglang
Traffic request rate:                    inf
Max request concurrency:                 16
Successful requests:                     80
Benchmark duration (s):                  87.04
Total input tokens:                      39668
Total input text tokens:                 39668
Total generated tokens:                  40805
Total generated tokens (retokenized):    40364
Request throughput (req/s):              0.92
Input token throughput (tok/s):          455.77
Output token throughput (tok/s):         468.83
Peak output token throughput (tok/s):    608.00
Peak concurrent requests:                20
Total token throughput (tok/s):          924.59
Concurrency:                             13.76
----------------End-to-End Latency----------------
Mean E2E Latency (ms):                   14966.88
Median E2E Latency (ms):                 15871.93
P90 E2E Latency (ms):                    24983.41
P99 E2E Latency (ms):                    29504.85
---------------Time to First Token----------------
Mean TTFT (ms):                          388.94
Median TTFT (ms):                        157.49
P99 TTFT (ms):                           1318.63
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms):                          29.41
Median TPOT (ms):                        29.22
P99 TPOT (ms):                           43.48
---------------Inter-Token Latency----------------
Mean ITL (ms):                           28.64
Median ITL (ms):                         26.42
P95 ITL (ms):                            27.51
P99 ITL (ms):                            131.63
Max ITL (ms):                            995.11
==================================================
```

##### 5.1.1.3 High Concurrency

* Benchmark Command:

```shell Command theme={null}
python3 -m sglang.bench_serving \
  --backend sglang \
  --model Qwen/Qwen3-Coder-480B-A35B-Instruct-FP8 \
  --dataset-name random \
  --random-input-len 1000 \
  --random-output-len 1000 \
  --num-prompts 320 \
  --max-concurrency 64
```

* Test Results:

```text Output theme={null}
============ Serving Benchmark Result ============
Backend:                                 sglang
Traffic request rate:                    inf
Max request concurrency:                 64
Successful requests:                     320
Benchmark duration (s):                  177.82
Total input tokens:                      158939
Total input text tokens:                 158939
Total generated tokens:                  170134
Total generated tokens (retokenized):    168387
Request throughput (req/s):              1.80
Input token throughput (tok/s):          893.84
Output token throughput (tok/s):         956.80
Peak output token throughput (tok/s):    1728.00
Peak concurrent requests:                70
Total token throughput (tok/s):          1850.64
Concurrency:                             58.88
----------------End-to-End Latency----------------
Mean E2E Latency (ms):                   32716.53
Median E2E Latency (ms):                 30896.37
P90 E2E Latency (ms):                    65605.24
P99 E2E Latency (ms):                    80970.63
---------------Time to First Token----------------
Mean TTFT (ms):                          372.97
Median TTFT (ms):                        181.67
P99 TTFT (ms):                           529.01
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms):                          62.98
Median TPOT (ms):                        50.44
P99 TPOT (ms):                           204.24
---------------Inter-Token Latency----------------
Mean ITL (ms):                           60.95
Median ITL (ms):                         37.87
P95 ITL (ms):                            143.98
P99 ITL (ms):                            148.02
Max ITL (ms):                            36863.32
==================================================
```

#### 5.1.2 NVIDIA (B200/GB200) Standard Scenario Benchmark

The following runs use the same random dataset benchmark client commands as the AMD section. On B200, launch the server with the following command:

```bash theme={null}
sglang serve --model Qwen/Qwen3-Coder-480B-A35B-Instruct-FP8 --tp 8 --ep 8 --context-length 8192 --page-size 32 --trust-remote-code
```

##### 5.1.2.1 FP8 Model

* Low Concurrency:

```text Output theme={null}
============ Serving Benchmark Result ============
Backend:                                 sglang
Traffic request rate:                    inf
Max request concurrency:                 1
Successful requests:                     10
Benchmark duration (s):                  42.68
Total input tokens:                      6101
Total input text tokens:                 6101
Total generated tokens:                  4220
Total generated tokens (retokenized):    4204
Request throughput (req/s):              0.23
Input token throughput (tok/s):          142.95
Output token throughput (tok/s):         98.88
Peak output token throughput (tok/s):    102.00
Peak concurrent requests:                2
Total token throughput (tok/s):          241.83
Concurrency:                             1.00
----------------End-to-End Latency----------------
Mean E2E Latency (ms):                   4266.06
Median E2E Latency (ms):                 3420.24
P90 E2E Latency (ms):                    7717.19
P99 E2E Latency (ms):                    9504.50
---------------Time to First Token----------------
Mean TTFT (ms):                          112.03
Median TTFT (ms):                        112.70
P99 TTFT (ms):                           115.35
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms):                          9.87
Median TPOT (ms):                        9.86
P99 TPOT (ms):                           9.92
---------------Inter-Token Latency----------------
Mean ITL (ms):                           9.87
Median ITL (ms):                         9.87
P95 ITL (ms):                            10.06
P99 ITL (ms):                            10.18
Max ITL (ms):                            14.80
==================================================
````

* Medium Concurrency:

```text Output theme={null}
============ Serving Benchmark Result ============
Backend:                                 sglang
Traffic request rate:                    inf
Max request concurrency:                 16
Successful requests:                     80
Benchmark duration (s):                  60.80
Total input tokens:                      39668
Total input text tokens:                 39668
Total generated tokens:                  40805
Total generated tokens (retokenized):    40543
Request throughput (req/s):              1.32
Input token throughput (tok/s):          652.43
Output token throughput (tok/s):         671.13
Peak output token throughput (tok/s):    864.00
Peak concurrent requests:                20
Total token throughput (tok/s):          1323.57
Concurrency:                             13.93
----------------End-to-End Latency----------------
Mean E2E Latency (ms):                   10587.26
Median E2E Latency (ms):                 11486.18
P90 E2E Latency (ms):                    17374.75
P99 E2E Latency (ms):                    21107.18
---------------Time to First Token----------------
Mean TTFT (ms):                          155.27
Median TTFT (ms):                        121.57
P99 TTFT (ms):                           294.31
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms):                          20.77
Median TPOT (ms):                        21.13
P99 TPOT (ms):                           23.62
---------------Inter-Token Latency----------------
Mean ITL (ms):                           20.49
Median ITL (ms):                         18.73
P95 ITL (ms):                            19.65
P99 ITL (ms):                            98.85
Max ITL (ms):                            536.87
==================================================
```

* High Concurrency:

```text Output theme={null}
============ Serving Benchmark Result ============
Backend:                                 sglang
Traffic request rate:                    inf
Max request concurrency:                 64
Successful requests:                     320
Benchmark duration (s):                  100.07
Total input tokens:                      158939
Total input text tokens:                 158939
Total generated tokens:                  170134
Total generated tokens (retokenized):    169119
Request throughput (req/s):              3.20
Input token throughput (tok/s):          1588.32
Output token throughput (tok/s):         1700.19
Peak output token throughput (tok/s):    2303.00
Peak concurrent requests:                71
Total token throughput (tok/s):          3288.51
Concurrency:                             57.93
----------------End-to-End Latency----------------
Mean E2E Latency (ms):                   18114.01
Median E2E Latency (ms):                 18279.15
P90 E2E Latency (ms):                    30557.22
P99 E2E Latency (ms):                    35889.84
---------------Time to First Token----------------
Mean TTFT (ms):                          346.40
Median TTFT (ms):                        129.75
P99 TTFT (ms):                           1370.20
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms):                          33.76
Median TPOT (ms):                        34.62
P99 TPOT (ms):                           39.97
---------------Inter-Token Latency----------------
Mean ITL (ms):                           33.48
Median ITL (ms):                         25.70
P95 ITL (ms):                            99.36
P99 ITL (ms):                            132.30
Max ITL (ms):                            1132.39
==================================================
```

##### 5.1.2.2 NVFP4 Model

* Low Concurrency:

```text Output theme={null}
============ Serving Benchmark Result ============
Backend:                                 sglang
Traffic request rate:                    inf
Max request concurrency:                 1
Successful requests:                     10
Benchmark duration (s):                  34.49
Total input tokens:                      6101
Total input text tokens:                 6101
Total generated tokens:                  4220
Total generated tokens (retokenized):    4218
Request throughput (req/s):              0.29
Input token throughput (tok/s):          176.87
Output token throughput (tok/s):         122.34
Peak output token throughput (tok/s):    127.00
Peak concurrent requests:                2
Total token throughput (tok/s):          299.21
Concurrency:                             1.00
----------------End-to-End Latency----------------
Mean E2E Latency (ms):                   3448.01
Median E2E Latency (ms):                 2768.11
P90 E2E Latency (ms):                    6225.73
P99 E2E Latency (ms):                    7668.26
---------------Time to First Token----------------
Mean TTFT (ms):                          104.55
Median TTFT (ms):                        105.38
P99 TTFT (ms):                           105.63
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms):                          7.94
Median TPOT (ms):                        7.95
P99 TPOT (ms):                           7.97
---------------Inter-Token Latency----------------
Mean ITL (ms):                           7.94
Median ITL (ms):                         7.94
P95 ITL (ms):                            8.05
P99 ITL (ms):                            8.11
Max ITL (ms):                            24.64
==================================================
```

* Medium Concurrency:

```text Output theme={null}
============ Serving Benchmark Result ============
Backend:                                 sglang
Traffic request rate:                    inf
Max request concurrency:                 16
Successful requests:                     80
Benchmark duration (s):                  43.30
Total input tokens:                      39668
Total input text tokens:                 39668
Total generated tokens:                  40805
Total generated tokens (retokenized):    39975
Request throughput (req/s):              1.85
Input token throughput (tok/s):          916.16
Output token throughput (tok/s):         942.42
Peak output token throughput (tok/s):    1264.00
Peak concurrent requests:                21
Total token throughput (tok/s):          1858.57
Concurrency:                             13.90
----------------End-to-End Latency----------------
Mean E2E Latency (ms):                   7521.95
Median E2E Latency (ms):                 8246.89
P90 E2E Latency (ms):                    12370.93
P99 E2E Latency (ms):                    15023.96
---------------Time to First Token----------------
Mean TTFT (ms):                          137.27
Median TTFT (ms):                        109.59
P99 TTFT (ms):                           208.78
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms):                          14.69
Median TPOT (ms):                        14.87
P99 TPOT (ms):                           17.63
---------------Inter-Token Latency----------------
Mean ITL (ms):                           14.51
Median ITL (ms):                         12.75
P95 ITL (ms):                            13.33
P99 ITL (ms):                            92.85
Max ITL (ms):                            113.70
==================================================
```

* High Concurrency:

```text Output theme={null}
============ Serving Benchmark Result ============
Backend:                                 sglang
Traffic request rate:                    inf
Max request concurrency:                 64
Successful requests:                     320
Benchmark duration (s):                  73.93
Total input tokens:                      158939
Total input text tokens:                 158939
Total generated tokens:                  170134
Total generated tokens (retokenized):    168841
Request throughput (req/s):              4.33
Input token throughput (tok/s):          2149.98
Output token throughput (tok/s):         2301.42
Peak output token throughput (tok/s):    3497.00
Peak concurrent requests:                71
Total token throughput (tok/s):          4451.40
Concurrency:                             58.28
----------------End-to-End Latency----------------
Mean E2E Latency (ms):                   13463.58
Median E2E Latency (ms):                 13498.74
P90 E2E Latency (ms):                    22957.10
P99 E2E Latency (ms):                    26656.95
---------------Time to First Token----------------
Mean TTFT (ms):                          239.00
Median TTFT (ms):                        113.42
P99 TTFT (ms):                           713.87
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms):                          25.13
Median TPOT (ms):                        26.02
P99 TPOT (ms):                           30.90
---------------Inter-Token Latency----------------
Mean ITL (ms):                           24.92
Median ITL (ms):                         16.68
P95 ITL (ms):                            93.33
P99 ITL (ms):                            119.26
Max ITL (ms):                            548.82
==================================================
```

### 5.2 Accuracy Benchmark

#### 5.2.1 GSM8K Benchmark

* **Benchmark Command:**

```shell Command theme={null}
python3 -m sglang.test.few_shot_gsm8k --num-questions 200
```

##### AMD (MI300X/MI325X/MI355X)

* **Results:**

  * Qwen/Qwen3-Coder-480B-A35B-Instruct-FP8
    ```
    Accuracy: 0.965
    Invalid: 0.000
    Latency: 23.084 s
    Output throughput: 1148.425 token/s
    ```

##### NVIDIA (B200/GB200)

For deployment commands, see [Section 3.1](#3-1-configuration).

* Qwen/Qwen3-Coder-480B-A35B-Instruct-FP8
  ```
  Accuracy: 0.965
  Invalid: 0.000
  Latency: 14.870 s
  Output throughput: 1777.726 token/s
  ```

* nvidia/Qwen3-Coder-480B-A35B-Instruct-NVFP4 (NVFP4)
  ```
  Accuracy: 0.960
  Invalid: 0.000
  Latency: 13.948 s
  Output throughput: 1988.548 token/s
  ```