> ## Documentation Index
> Fetch the complete documentation index at: https://docs.sglang.io/llms.txt
> Use this file to discover all available pages before exploring further.

# DeepSeek-V3

export const DeepSeekV3Deployment = () => {
  const options = {
    hardware: {
      name: 'hardware',
      title: 'Hardware Platform',
      items: [{
        id: 'h100',
        label: 'H100',
        default: false
      }, {
        id: 'h200',
        label: 'H200',
        default: false
      }, {
        id: 'b200',
        label: 'B200',
        default: true
      }, {
        id: 'mi300x',
        label: 'MI300X',
        default: false
      }, {
        id: 'mi325x',
        label: 'MI325X',
        default: false
      }, {
        id: 'mi355x',
        label: 'MI355X',
        default: false
      }, {
        id: 'xeon',
        label: 'XEON',
        default: false
      }]
    },
    quantization: {
      name: 'quantization',
      title: 'Quantization',
      getDynamicItems: values => {
        const isXeon = values.hardware === 'xeon';
        return [{
          id: 'fp8',
          label: 'FP8',
          default: true
        }, {
          id: 'fp4',
          label: 'FP4',
          default: false,
          disabled: isXeon,
          disabledReason: 'Intel Xeon CPUs do not support FP4 quantization'
        }];
      }
    },
    strategy: {
      name: 'strategy',
      title: 'Deployment Strategy',
      type: 'checkbox',
      items: [{
        id: 'tp',
        label: 'TP',
        subtitle: 'Tensor Parallel',
        default: true,
        required: true
      }, {
        id: 'dp',
        label: 'DP',
        subtitle: 'Data Parallel',
        default: false,
        disabledWhen: v => v.hardware === 'xeon'
      }, {
        id: 'ep',
        label: 'EP',
        subtitle: 'Expert Parallel',
        default: false,
        disabledWhen: v => v.hardware === 'xeon'
      }, {
        id: 'mtp',
        label: 'MTP',
        subtitle: 'Multi-token Prediction',
        default: false,
        disabledWhen: v => v.hardware === 'xeon'
      }]
    },
    thinking: {
      name: 'thinking',
      title: 'Reasoning Parser',
      items: [{
        id: 'disabled',
        label: 'Disabled',
        default: true
      }, {
        id: 'enabled',
        label: 'Enabled',
        default: false
      }]
    },
    toolcall: {
      name: 'toolcall',
      title: 'Tool Call Parser',
      items: [{
        id: 'disabled',
        label: 'Disabled',
        default: true
      }, {
        id: 'enabled',
        label: 'Enabled',
        default: false
      }]
    }
  };
  const getInitialState = () => {
    const initialState = {};
    Object.entries(options).forEach(([key, option]) => {
      if (option.type === 'checkbox') {
        initialState[key] = option.items.filter(item => item.default).map(item => item.id);
      } else {
        const items = typeof option.getDynamicItems === 'function' ? option.getDynamicItems(initialState) : option.items;
        const defaultItem = items.find(item => item.default && !item.disabled) || items.find(item => !item.disabled);
        initialState[key] = defaultItem ? defaultItem.id : items[0].id;
      }
    });
    return initialState;
  };
  const [values, setValues] = useState(getInitialState);
  const [isDark, setIsDark] = useState(false);
  useEffect(() => {
    const checkDarkMode = () => {
      const html = document.documentElement;
      const isDarkMode = html.classList.contains('dark') || html.getAttribute('data-theme') === 'dark' || html.style.colorScheme === 'dark';
      setIsDark(isDarkMode);
    };
    checkDarkMode();
    const observer = new MutationObserver(checkDarkMode);
    observer.observe(document.documentElement, {
      attributes: true,
      attributeFilter: ['class', 'data-theme', 'style']
    });
    return () => observer.disconnect();
  }, []);
  const handleRadioChange = (optionName, value) => {
    setValues(prev => {
      const next = {
        ...prev,
        [optionName]: value
      };
      if (optionName === 'hardware') {
        const quantizationItems = typeof options.quantization.getDynamicItems === 'function' ? options.quantization.getDynamicItems(next) : options.quantization.items || [];
        const currentQuantization = quantizationItems.find(item => item.id === next.quantization);
        if (!currentQuantization || currentQuantization.disabled) {
          const fallback = quantizationItems.find(item => item.default && !item.disabled) || quantizationItems.find(item => !item.disabled);
          if (fallback) {
            next.quantization = fallback.id;
          }
        }
        const strategyItems = options.strategy.items || [];
        const current = Array.isArray(next.strategy) ? next.strategy : [];
        next.strategy = current.filter(id => {
          const item = strategyItems.find(s => s.id === id);
          if (!item) return false;
          if (typeof item.disabledWhen === 'function' && item.disabledWhen(next)) return false;
          return true;
        });
      }
      return next;
    });
  };
  const handleCheckboxChange = (optionName, itemId, isChecked) => {
    setValues(prev => {
      const currentValues = prev[optionName] || [];
      if (isChecked) {
        return {
          ...prev,
          [optionName]: [...currentValues, itemId]
        };
      } else {
        return {
          ...prev,
          [optionName]: currentValues.filter(id => id !== itemId)
        };
      }
    });
  };
  const generateCommand = () => {
    const {hardware, quantization, strategy, thinking, toolcall} = values;
    const strategyArray = Array.isArray(strategy) ? strategy : [];
    if (['h100', 'h200', 'mi300x', 'mi325x'].includes(hardware) && quantization === 'fp4') {
      return '# Error: This hardware only supports FP8 quantization\n# Please select FP8 quantization or use B200/MI355X hardware';
    }
    const modelPath = quantization === 'fp4' ? 'nvidia/DeepSeek-V3-0324-NVFP4' : 'deepseek-ai/DeepSeek-V3';
    const isXeon = hardware === 'xeon';
    let cmd = 'python3 -m sglang.launch_server \\\n';
    cmd += `  --model-path ${modelPath}`;
    if (strategyArray.includes('tp')) cmd += isXeon ? ' \\\n  --tp 6' : ' \\\n  --tp 8';
    if (strategyArray.includes('dp')) cmd += ' \\\n  --dp 8 \\\n  --enable-dp-attention';
    if (strategyArray.includes('ep')) cmd += ' \\\n  --ep 8';
    if (strategyArray.includes('mtp')) {
      cmd = 'SGLANG_ENABLE_SPEC_V2=1 ' + cmd;
      cmd += ' \\\n  --speculative-algorithm EAGLE \\\n  --speculative-num-steps 3 \\\n  --speculative-eagle-topk 1 \\\n  --speculative-num-draft-tokens 4';
    }
    if (!isXeon) {
      cmd += ' \\\n  --enable-symm-mem # Optional: improves performance, but may be unstable';
    }
    if (hardware === 'b200') {
      cmd += ' \\\n  --kv-cache-dtype fp8_e4m3 # Optional: enables fp8 kv cache and fp8 attention kernels';
    }
    if (isXeon) {
      cmd += ' \\\n  --device cpu \\\n  --disable-overlap-schedule';
    }
    if (thinking === 'enabled') cmd += ' \\\n  --reasoning-parser deepseek-v3';
    if (toolcall === 'enabled') cmd += ' \\\n  --tool-call-parser deepseekv3 \\\n  --chat-template examples/chat_template/tool_chat_template_deepseekv3.jinja';
    return cmd;
  };
  const containerStyle = {
    maxWidth: '900px',
    margin: '0 auto',
    display: 'flex',
    flexDirection: 'column',
    gap: '4px'
  };
  const cardStyle = {
    padding: '8px 12px',
    border: `1px solid ${isDark ? '#374151' : '#e5e7eb'}`,
    borderLeft: `3px solid ${isDark ? '#E85D4D' : '#D45D44'}`,
    borderRadius: '4px',
    display: 'flex',
    alignItems: 'center',
    gap: '12px',
    background: isDark ? '#1f2937' : '#fff'
  };
  const titleStyle = {
    fontSize: '13px',
    fontWeight: '600',
    minWidth: '140px',
    flexShrink: 0,
    color: isDark ? '#e5e7eb' : 'inherit'
  };
  const itemsStyle = {
    display: 'flex',
    rowGap: '2px',
    columnGap: '6px',
    flexWrap: 'wrap',
    alignItems: 'center',
    flex: 1
  };
  const labelBaseStyle = {
    padding: '4px 10px',
    border: `1px solid ${isDark ? '#9ca3af' : '#d1d5db'}`,
    borderRadius: '3px',
    cursor: 'pointer',
    display: 'inline-flex',
    flexDirection: 'column',
    alignItems: 'center',
    justifyContent: 'center',
    fontWeight: '500',
    fontSize: '13px',
    transition: 'all 0.2s',
    userSelect: 'none',
    minWidth: '45px',
    textAlign: 'center',
    flex: 1,
    background: isDark ? '#374151' : '#fff',
    color: isDark ? '#e5e7eb' : 'inherit'
  };
  const checkedStyle = {
    background: '#D45D44',
    color: 'white',
    borderColor: '#D45D44'
  };
  const disabledStyle = {
    cursor: 'not-allowed',
    opacity: 0.5
  };
  const subtitleStyle = {
    display: 'block',
    fontSize: '9px',
    marginTop: '1px',
    lineHeight: '1.1',
    opacity: 0.7
  };
  const commandDisplayStyle = {
    flex: 1,
    padding: '12px 16px',
    background: isDark ? '#111827' : '#f5f5f5',
    borderRadius: '6px',
    fontFamily: "'Menlo', 'Monaco', 'Courier New', monospace",
    fontSize: '12px',
    lineHeight: '1.5',
    color: isDark ? '#e5e7eb' : '#374151',
    whiteSpace: 'pre-wrap',
    overflowX: 'auto',
    margin: 0,
    border: `1px solid ${isDark ? '#374151' : '#e5e7eb'}`
  };
  return <div style={containerStyle} className="not-prose">
      {Object.entries(options).map(([key, option]) => <div key={key} style={cardStyle}>
          <div style={titleStyle}>{option.title}</div>
          <div style={itemsStyle}>
            {option.type === 'checkbox' ? option.items.map(item => {
    const isChecked = (values[option.name] || []).includes(item.id);
    const dynDisabled = typeof item.disabledWhen === 'function' && item.disabledWhen(values);
    const isDisabled = item.required || dynDisabled;
    return <label key={item.id} title={dynDisabled ? 'Not supported on the selected hardware' : ''} style={{
      ...labelBaseStyle,
      ...isChecked ? checkedStyle : {},
      ...isDisabled ? disabledStyle : {}
    }}>
                    <input type="checkbox" checked={isChecked} disabled={isDisabled} onChange={e => !dynDisabled && handleCheckboxChange(option.name, item.id, e.target.checked)} style={{
      display: 'none'
    }} />
                    {item.label}
                    {item.subtitle && <small style={{
      ...subtitleStyle,
      color: isChecked ? 'rgba(255,255,255,0.85)' : 'inherit'
    }}>{item.subtitle}</small>}
                  </label>;
  }) : (option.getDynamicItems ? option.getDynamicItems(values) : option.items).map(item => {
    const isChecked = values[option.name] === item.id;
    const isDisabled = Boolean(item.disabled);
    return <label key={item.id} title={item.disabledReason || ''} style={{
      ...labelBaseStyle,
      ...isChecked ? checkedStyle : {},
      ...isDisabled ? disabledStyle : {}
    }}>
                      <input type="radio" name={option.name} value={item.id} checked={isChecked} disabled={isDisabled} onChange={() => !isDisabled && handleRadioChange(option.name, item.id)} style={{
      display: 'none'
    }} />
                    {item.label}
                    {item.subtitle && <small style={{
      ...subtitleStyle,
      color: isChecked ? 'rgba(255,255,255,0.85)' : 'inherit'
    }}>{item.subtitle}</small>}
                  </label>;
  })}
          </div>
        </div>)}
      <div style={cardStyle}>
        <div style={titleStyle}>Run this Command:</div>
        <pre style={commandDisplayStyle}>{generateCommand()}</pre>
      </div>
    </div>;
};

## 1. Model Introduction

[DeepSeek V3](https://huggingface.co/deepseek-ai/DeepSeek-V3) is a large-scale Mixture-of-Experts (MoE) language model developed by DeepSeek, designed to deliver strong general-purpose reasoning, coding, and tool-augmented capabilities with high training and inference efficiency. As the latest generation in the DeepSeek model family, DeepSeek V3 introduces systematic architectural and training innovations that significantly improve performance across reasoning, mathematics, coding, and long-context understanding, while maintaining a competitive compute cost.

Key highlights include:

* **Efficient MoE architecture**: DeepSeek V3 adopts a fine-grained Mixture-of-Experts design with a large number of experts and sparse activation, enabling high model capacity while keeping inference and training costs manageable.
* **Advanced reasoning and coding**: The model demonstrates strong performance on mathematical reasoning, logical inference, and real-world coding benchmarks, benefiting from improved data curation and training strategies.
* **Long-context capability**: DeepSeek V3 supports extended context lengths, allowing it to handle long documents, complex multi-step reasoning, and agent-style workflows more effectively.
* **Tool use and function calling**: The model is trained to support structured outputs and tool invocation, enabling seamless integration with external tools and agent frameworks during inference.

## 2. SGLang Installation

SGLang offers multiple installation methods. You can choose the most suitable installation method based on your hardware platform and requirements.

Please refer to the [official SGLang installation guide](../../../docs/get-started/install) for installation instructions.

For SGLang CPU installation, please refer to the [CPU version installation guide](../../../docs/hardware-platforms/cpu_server#installation).

## 3. Model Deployment

This section provides a progressive guide from quick deployment to performance optimization, suitable for users at different levels.

### 3.1 Basic Configuration

**Interactive Command Generator**: Use the configuration selector below to automatically generate the appropriate deployment command for your hardware platform, quantization, deployment strategy, and reasoning/tool-call parser settings.

<DeepSeekV3Deployment />

### 3.2 Configuration Tips

**Recommended GPU configurations by weight type:**

<table style={{width: "100%", borderCollapse: "collapse"}}>
  <thead>
    <tr style={{borderBottom: "2px solid #d55816"}}>
      <th style={{textAlign: "left", padding: "10px 12px", fontWeight: 700, backgroundColor: "rgba(255,255,255,0.02)"}}>Weight Type</th>
      <th style={{textAlign: "left", padding: "10px 12px", fontWeight: 700, backgroundColor: "rgba(255,255,255,0.05)"}}>Supported Hardware</th>
    </tr>
  </thead>

  <tbody>
    <tr>
      <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.02)"}}><strong>FP8</strong> (recommended)</td>
      <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.05)"}}>8× H200, 8× B200, 8× MI300X, 2×8× H100/H800/H20</td>
    </tr>

    <tr>
      <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.02)"}}><strong>BF16</strong> (upcast from FP8)</td>
      <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.05)"}}>2×8× H200, 2×8× MI300X, 4×8× H100/H800, 4×8× A100/A800</td>
    </tr>

    <tr>
      <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.02)"}}><strong>INT8</strong></td>
      <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.05)"}}>16× A100/A800, 32× L40S, Xeon 6980P CPU, 4× Atlas 800I A3</td>
    </tr>

    <tr>
      <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.02)"}}><strong>W4A8 / AWQ / MXFP4 / NVFP4</strong></td>
      <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.05)"}}>8× H20/H100, 4× H200; 8× H100/A100; 8/4× MI355X/MI350X; 8/4× B200</td>
    </tr>
  </tbody>
</table>

> The official DeepSeek-V3 checkpoint is already in FP8 format — do **not** add `--quantization fp8` when serving it.

**DeepGEMM precompilation (NVIDIA Hopper / Blackwell):** Precompile GEMM kernels before the first server run to avoid JIT overhead (\~10 min):

```bash theme={null}
python3 -m sglang.compile_deep_gemm --model deepseek-ai/DeepSeek-V3 --tp 8 --trust-remote-code
```

DeepGEMM is enabled by default on Hopper/Blackwell and can be disabled with `SGLANG_ENABLE_JIT_DEEPGEMM=0`.

**Data Parallelism Attention (`--enable-dp-attention`):** Recommended for high-throughput scenarios with large batch sizes. Reduces KV-cache duplication across TP ranks. Use `--enable-dp-attention --tp 8 --dp 8` on a single 8-GPU node. Not recommended for low-latency, small-batch workloads.

**NCCL timeout:** If model loading is slow and you hit an NCCL timeout, increase it: `--dist-timeout 3600`.

To configure CPU serving, see the `Notes` in the serving engine launch section of [the SGLang CPU server document](../../../docs/hardware-platforms/cpu_server#launch-of-the-serving-engine), which explain how to set the arguments, especially the TP (tensor parallel) and NUMA binding settings.

## 4. Model Invocation

### 4.1 Basic Usage

For basic API usage and request examples, please refer to:

* [Basic API Usage](../../../docs/get-started/quickstart)

### 4.2 Advanced Usage

#### 4.2.1 Reasoning Parser

DeepSeek-V3 supports reasoning mode. Enable the reasoning parser during deployment to separate the thinking and content sections:

```shell Command theme={null}
python -m sglang.launch_server \
  --model deepseek-ai/DeepSeek-V3 \
  --reasoning-parser deepseek-v3 \
  --tp 8
```

**Streaming with Thinking Process:**

```python Example theme={null}
from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:30000/v1",
    api_key="EMPTY"
)

# Enable streaming to see the thinking process in real-time
response = client.chat.completions.create(
    model="deepseek-ai/DeepSeek-V3",
    messages=[
        {"role": "user", "content": "Solve this problem step by step: What is 15% of 240?"}
    ],
    temperature=0.7,
    max_tokens=2048,
    extra_body = {"chat_template_kwargs": {"thinking": True}},
    stream=True
)

# Process the stream
has_thinking = False
has_answer = False
thinking_started = False

for chunk in response:
    if chunk.choices and len(chunk.choices) > 0:
        delta = chunk.choices[0].delta

        # Print thinking process
        if hasattr(delta, 'reasoning_content') and delta.reasoning_content:
            if not thinking_started:
                print("=============== Thinking =================", flush=True)
                thinking_started = True
            has_thinking = True
            print(delta.reasoning_content, end="", flush=True)

        # Print answer content
        if delta.content:
            # Close thinking section and add content header
            if has_thinking and not has_answer:
                print("\n=============== Content =================", flush=True)
                has_answer = True
            print(delta.content, end="", flush=True)

print()
```

**Output Example:**

```text Output theme={null}
=============== Thinking =================
To determine 15% of a number, follow these steps:

**Step 1: Understand the Problem**
You need to find 15% of a given number. Let's assume the number is 240 for this example.

**Step 2: Convert the Percentage to a Decimal**
To work with percentages in calculations, convert the percentage to its decimal form. To do this, divide the percentage by 100.

\[ 15\% = \frac{15}{100} = 0.15 \]

**Step 3: Multiply the Decimal by the Number**
Now, multiply the decimal form of the percentage by the number you want to find the percentage of.

\[ 0.15 \times 240 \]

**Step 4: Perform the Multiplication**
Calculate the product:

\[ 0.15 \times 240 = 36 \]

**Step 5: Conclusion**
Therefore, 15% of 240 is:

\boxed{36}

The answer is 36. To find 15% of 240, we multiply 240 by 0.15, which equals 36.
```

**Note:** The reasoning parser captures the model's step-by-step thinking process, allowing you to see how the model arrives at its conclusions.

#### 4.2.2 Tool Calling

DeepSeek-V3 supports tool calling capabilities. Enable the tool call parser:

**Deployment Command:**

```shell Command theme={null}
python -m sglang.launch_server \
  --model deepseek-ai/DeepSeek-V3 \
  --tool-call-parser deepseekv3 \
  --reasoning-parser deepseek-v3 \
  --chat-template ./examples/chat_template/tool_chat_template_deepseekv3.jinja \
  --tp 8 \
  --host 0.0.0.0 \
  --port 30000
```

**Quick Test (curl):**

```shell Command theme={null}
curl "http://127.0.0.1:30000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  -d '{
    "temperature": 0,
    "max_tokens": 100,
    "model": "deepseek-ai/DeepSeek-V3",
    "tools": [{"type": "function", "function": {"name": "query_weather", "description": "Get weather of a city", "parameters": {"type": "object", "properties": {"city": {"type": "string"}}, "required": ["city"]}}}],
    "messages": [{"role": "user", "content": "How'\''s the weather in Beijing today?"}]
  }'
```

<Note>
  Use a low `temperature` (e.g. `0`) for more consistent tool call results. The `--chat-template` flag above provides an improved unified prompt for tool use.
</Note>

**Python Example (with Thinking Process):**

```python Example theme={null}
from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:30000/v1",
    api_key="EMPTY"
)

# Define available tools
tools = [
    {
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Get the current weather for a location",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {
                        "type": "string",
                        "description": "The city name"
                    },
                    "unit": {
                        "type": "string",
                        "enum": ["celsius", "fahrenheit"],
                        "description": "Temperature unit"
                    }
                },
                "required": ["location"]
            }
        }
    }
]

# Make request with streaming to see thinking process
response = client.chat.completions.create(
    model="deepseek-ai/DeepSeek-V3",
    messages=[
        {"role": "user", "content": "What's the weather in Beijing?"}
    ],
    tools=tools,
    extra_body = {"chat_template_kwargs": {"thinking": True}},
    temperature=0.7,
    stream=True
)

# Process streaming response
thinking_started = False
has_thinking = False
tool_calls_accumulator = {}

for chunk in response:
    if chunk.choices and len(chunk.choices) > 0:
        delta = chunk.choices[0].delta

        # Print thinking process
        if hasattr(delta, 'reasoning_content') and delta.reasoning_content:
            if not thinking_started:
                print("=============== Thinking =================", flush=True)
                thinking_started = True
            has_thinking = True
            print(delta.reasoning_content, end="", flush=True)

        # Accumulate tool calls
        if hasattr(delta, 'tool_calls') and delta.tool_calls:
            # Close thinking section if needed
            if has_thinking and thinking_started:
                print("\n=============== Content =================\n", flush=True)
                thinking_started = False

            for tool_call in delta.tool_calls:
                index = tool_call.index
                if index not in tool_calls_accumulator:
                    tool_calls_accumulator[index] = {
                        'name': None,
                        'arguments': ''
                    }

                if tool_call.function:
                    if tool_call.function.name:
                        tool_calls_accumulator[index]['name'] = tool_call.function.name
                    if tool_call.function.arguments:
                        tool_calls_accumulator[index]['arguments'] += tool_call.function.arguments

        # Print content
        if delta.content:
            print(delta.content, end="", flush=True)

# Print accumulated tool calls
for index, tool_call in sorted(tool_calls_accumulator.items()):
    print(f"🔧 Tool Call: {tool_call['name']}")
    print(f"   Arguments: {tool_call['arguments']}")

print()
```

**Output Example:**

```text Output theme={null}
🔧 Tool Call: get_weather
   Arguments: {"location": "Beijing", "unit": "celsius"}
```

**Note:**

* The reasoning parser shows how the model decides to use a tool
* Tool calls are clearly marked with the function name and arguments
* You can then execute the function and send the result back to continue the conversation

**Handling Tool Call Results:**

Append the code block below to the previous Python script.

```python Example theme={null}
# After getting the tool call, execute the function
def get_weather(location, unit="celsius"):
    """Return a canned weather report for ``location``.

    Placeholder for a real weather lookup: the reported conditions are fixed,
    and the unit is rendered as the upper-cased first letter of ``unit``.
    """
    # Your actual weather API call here
    unit_symbol = unit[0].upper()
    return f"The weather in {location} is 22°{unit_symbol} and sunny."

# Send tool result back to the model
# Turn 1: the original user question.
user_turn = {"role": "user", "content": "What's the weather in Beijing?"}

# Turn 2: the assistant's tool request; content is None because this turn
# carries only the tool call.
assistant_turn = {
    "role": "assistant",
    "content": None,
    "tool_calls": [
        {
            "id": "call_123",
            "type": "function",
            "function": {
                "name": "get_weather",
                "arguments": '{"location": "Beijing", "unit": "celsius"}',
            },
        }
    ],
}

# Turn 3: the tool's answer, tagged with the same id as the call in turn 2.
tool_turn = {
    "role": "tool",
    "tool_call_id": "call_123",
    "content": get_weather("Beijing", "celsius"),
}

messages = [user_turn, assistant_turn, tool_turn]

# Ask the model for its final, user-facing answer given the tool output.
final_response = client.chat.completions.create(
    model="deepseek-ai/DeepSeek-V3",
    messages=messages,
    temperature=0.7,
)

print(final_response.choices[0].message.content)
# Output: "The weather in Beijing is currently 22°C and sunny."
```

#### 4.2.3 Multi-Token Prediction (EAGLE Speculative Decoding)

SGLang implements DeepSeek V3 Multi-Token Prediction (MTP) based on [EAGLE speculative decoding](../../../docs/advanced_features/speculative_decoding#EAGLE-Decoding). With this optimization, decoding speed improves by up to **1.8×** at batch size 1 and **1.5×** at batch size 32 on H200 TP8.

**Enable with:**

```shell Command theme={null}
python3 -m sglang.launch_server \
  --model-path deepseek-ai/DeepSeek-V3-0324 \
  --speculative-algorithm EAGLE \
  --trust-remote-code \
  --tp 8
```

The default configuration is `--speculative-num-steps 3 --speculative-eagle-topk 1 --speculative-num-draft-tokens 4`. Find the best values for your workload with [bench\_speculative.py](https://github.com/sgl-project/sglang/blob/main/scripts/playground/bench_speculative.py). The minimum viable config is `--speculative-num-steps 1 --speculative-eagle-topk 1 --speculative-num-draft-tokens 2`.

<Note>
  For large batch sizes (>48), increase `--max-running-requests` beyond the default of 48 for MTP. Also set `--cuda-graph-bs` to include your target batch sizes (default captured sizes for speculative decoding: 48).
</Note>

<Tip>
  The spec-v2 overlap scheduler is enabled by default. It improves performance by overlapping draft and verification stages. Pass `--disable-overlap-schedule` to disable.
</Tip>

#### 4.2.4 MLA Optimizations

DeepSeek V3 uses [Multi-head Latent Attention (MLA)](https://arxiv.org/pdf/2405.04434), an attention mechanism that improves inference efficiency. SGLang implements several optimizations:

* **Weight Absorption:** Reorders matrix multiplications to improve decoding phase efficiency.
* **MLA Attention Backends:** FA3 (FlashAttention3), FlashInfer, FlashMLA, CutlassMLA, TRTLLM MLA (Blackwell), and Triton. FA3 is the default.
* **FP8 Quantization:** W8A8 FP8 and KV Cache FP8, with BMM operators for weight-absorbed MLA in FP8.
* **CUDA Graph & `torch.compile`:** Both MLA and MoE support CUDA Graph and `torch.compile` for reduced decoding latency.
* **Chunked Prefix Cache:** Increases throughput for long-sequence chunked prefill (FlashAttention3 backend only).

Overall, these optimizations achieve up to **7×** output throughput improvement vs. the baseline.

**Reference:** See [SGLang v0.3 blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/#deepseek-multi-head-latent-attention-mla-throughput-optimizations) and [Slides](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/lmsys_1st_meetup_deepseek_mla.pdf) for details.

#### 4.2.5 Multi-Node Deployment

For multi-node serving and hardware-specific examples:

* [8× H200 / 4–8× B200](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3#using-docker-recommended)
* [8× MI300X](../../../docs/hardware-platforms/amd_gpu#running-deepseek-v3)
* [2×8× H200 with Docker](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3#example-serving-with-two-h2008-nodes-and-docker)
* [4×8× A100](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3#example-serving-with-four-a1008-nodes)
* [8× A100 AWQ](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3#example-serving-with-8-a100a800-with-awq-quantization)
* [16× A100 INT8](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3#example-serving-with-16-a100a800-with-int8-quantization)
* [32× L40S INT8](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3#example-serving-with-32-l40s-with-int8-quantization)
* [Xeon 6980P CPU](../../../docs/hardware-platforms/cpu_server#example-running-deepseek-r1)
* [4× Atlas 800I A3 (int8)](../../../docs/hardware-platforms/ascend-npus/ascend_npu_deepseek_example#running-deepseek-with-pd-disaggregation-on-4-x-atlas-800i-a3)

**Blog references for large-scale deployment:**

* [Deploying DeepSeek on GB200 NVL72 with PD and Large Scale EP](https://lmsys.org/blog/2025-06-16-gb200-part-1/) ([Part I](https://lmsys.org/blog/2025-06-16-gb200-part-1/), [Part II](https://lmsys.org/blog/2025-09-25-gb200-part-2/))
* [PD Disaggregation and Large-Scale Expert Parallelism on 96× H100](https://lmsys.org/blog/2025-05-05-large-scale-ep/)
* [Best Practices for Serving DeepSeek-R1 on H20](https://lmsys.org/blog/2025-09-26-sglang-ant-group/)

## 5. Benchmark

### 5.1 Speed Benchmark

**Test Environment:**

* Hardware: AMD MI300X GPU (8x)
* Model: DeepSeek-V3
* Tensor Parallelism: 8
* SGLang version: 0.5.7

We use SGLang's built-in benchmarking tool to conduct performance evaluation on the [ShareGPT\_Vicuna\_unfiltered](https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered) dataset. This dataset contains real conversation data and therefore better reflects performance in actual usage scenarios. The benchmark commands below pass `--random-input-len 1024` and `--random-output-len 1024` to target medium-length conversations with detailed responses; note that `bench_serving` applies these two flags only with `--dataset-name random`, so the reported runs use ShareGPT's own prompt lengths (roughly 200–300 input tokens per request, per the token totals in the results).

#### 5.1.1 Latency-Sensitive Benchmark

* Model Deployment Command:

```shell Command theme={null}
python3 -m sglang.launch_server \
  --model-path deepseek-ai/DeepSeek-V3 \
  --tp 8 \
  --dp 8 \
  --enable-dp-attention \
  --speculative-algorithm EAGLE \
  --speculative-num-steps 3 \
  --speculative-eagle-topk 1 \
  --speculative-num-draft-tokens 4 \
  --host 0.0.0.0 \
  --port 8000
```

* Benchmark Command:

```shell Command theme={null}
python3 -m sglang.bench_serving \
  --backend sglang \
  --host 127.0.0.1 \
  --port 8000 \
  --model deepseek-ai/DeepSeek-V3 \
  --random-input-len 1024 \
  --random-output-len 1024 \
  --num-prompts 10 \
  --max-concurrency 1
```

* **Test Results:**

```text Output theme={null}
============ Serving Benchmark Result ============
Backend:                                 sglang
Traffic request rate:                    inf
Max request concurrency:                 1
Successful requests:                     10
Benchmark duration (s):                  81.27
Total input tokens:                      1972
Total input text tokens:                 1972
Total input vision tokens:               0
Total generated tokens:                  2784
Total generated tokens (retokenized):    2774
Request throughput (req/s):              0.12
Input token throughput (tok/s):          24.27
Output token throughput (tok/s):         34.26
Peak output token throughput (tok/s):    65.00
Peak concurrent requests:                2
Total token throughput (tok/s):          58.52
Concurrency:                             1.00
Accept length:                           2.61
----------------End-to-End Latency----------------
Mean E2E Latency (ms):                   8123.17
Median E2E Latency (ms):                 7982.65
---------------Time to First Token----------------
Mean TTFT (ms):                          1080.76
Median TTFT (ms):                        1248.82
P99 TTFT (ms):                           1896.37
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms):                          25.04
Median TPOT (ms):                        24.76
P99 TPOT (ms):                           32.09
---------------Inter-Token Latency----------------
Mean ITL (ms):                           25.41
Median ITL (ms):                         20.14
P95 ITL (ms):                            60.28
P99 ITL (ms):                            60.99
Max ITL (ms):                            61.49
==================================================
```

#### 5.1.2 Throughput-Sensitive Benchmark

* Model Deployment Command:

```shell Command theme={null}
python3 -m sglang.launch_server \
  --model-path deepseek-ai/DeepSeek-V3 \
  --tp 8 \
  --ep 8 \
  --dp 8 \
  --enable-dp-attention \
  --host 0.0.0.0 \
  --port 8000
```

* Benchmark Command:

```shell Command theme={null}
python3 -m sglang.bench_serving \
  --backend sglang \
  --host 127.0.0.1 \
  --port 8000 \
  --model deepseek-ai/DeepSeek-V3 \
  --random-input-len 1024 \
  --random-output-len 1024 \
  --num-prompts 1000 \
  --max-concurrency 100
```

* **Test Results:**

```text Output theme={null}
============ Serving Benchmark Result ============
Backend:                                 sglang
Traffic request rate:                    inf
Max request concurrency:                 100
Successful requests:                     1000
Benchmark duration (s):                  406.16
Total input tokens:                      301701
Total input text tokens:                 301701
Total input vision tokens:               0
Total generated tokens:                  188375
Total generated tokens (retokenized):    187542
Request throughput (req/s):              2.46
Input token throughput (tok/s):          742.81
Output token throughput (tok/s):         463.80
Peak output token throughput (tok/s):    1299.00
Peak concurrent requests:                109
Total token throughput (tok/s):          1206.61
Concurrency:                             87.53
----------------End-to-End Latency----------------
Mean E2E Latency (ms):                   35552.98
Median E2E Latency (ms):                 21466.07
---------------Time to First Token----------------
Mean TTFT (ms):                          1521.51
Median TTFT (ms):                        476.80
P99 TTFT (ms):                           8329.50
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms):                          214.73
Median TPOT (ms):                        152.00
P99 TPOT (ms):                           1155.85
---------------Inter-Token Latency----------------
Mean ITL (ms):                           182.10
Median ITL (ms):                         79.18
P95 ITL (ms):                            398.60
P99 ITL (ms):                            1488.96
Max ITL (ms):                            43465.60
==================================================
```

### 5.2 Accuracy Benchmark

#### 5.2.1 GSM8K Benchmark

* **Benchmark Command:**

```shell Command theme={null}
python3 -m sglang.test.few_shot_gsm8k --num-questions 200 --port 8000
```

* **Test Results:**
  * DeepSeek-V3
    ```text Output theme={null}
    Accuracy: 0.960
    Invalid: 0.000
    Latency: 32.450 s
    Output throughput: 614.211 token/s
    ```

#### 5.2.2 MMLU Benchmark

* **Benchmark Command:**

```shell Command theme={null}
cd sglang
bash benchmark/mmlu/download_data.sh
python3 benchmark/mmlu/bench_sglang.py --nsub 10 --port 8000
```

* **Test Results:**
  * DeepSeek-V3
    ```text Output theme={null}
    subject: abstract_algebra, #q:100, acc: 0.800
    subject: anatomy, #q:135, acc: 0.874
    subject: astronomy, #q:152, acc: 0.928
    subject: business_ethics, #q:100, acc: 0.880
    subject: clinical_knowledge, #q:265, acc: 0.928
    subject: college_biology, #q:144, acc: 0.965
    subject: college_chemistry, #q:100, acc: 0.670
    subject: college_computer_science, #q:100, acc: 0.840
    subject: college_mathematics, #q:100, acc: 0.800
    subject: college_medicine, #q:173, acc: 0.861
    Total latency: 58.339
    Average accuracy: 0.871
    ```
