> ## Documentation Index
> Fetch the complete documentation index at: https://docs.sglang.io/llms.txt
> Use this file to discover all available pages before exploring further.

# DeepSeek-V3.1

export const DeepSeekV31Deployment = () => {
  const options = {
    hardware: {
      name: 'hardware',
      title: 'Hardware Platform',
      items: [{
        id: 'h200',
        label: 'H200',
        default: true
      }, {
        id: 'b200',
        label: 'B200',
        default: false
      }, {
        id: 'mi300x',
        label: 'MI300X',
        default: false
      }, {
        id: 'mi325x',
        label: 'MI325X',
        default: false
      }, {
        id: 'mi355x',
        label: 'MI355X',
        default: false
      }, {
        id: 'xeon',
        label: 'XEON',
        default: false
      }]
    },
    modelname: {
      name: 'modelname',
      title: 'Model Name',
      items: [{
        id: 'v31',
        label: 'DeepSeek-V3.1',
        default: true
      }, {
        id: 'v31terminus',
        label: 'DeepSeek-V3.1-Terminus',
        default: false
      }, {
        id: 'v31terminusint8',
        label: 'DeepSeek-V3.1-Terminus-Channel-int8',
        default: false,
        xeonOnly: true
      }]
    },
    strategy: {
      name: 'strategy',
      title: 'Deployment Strategy',
      type: 'checkbox',
      items: [{
        id: 'tp',
        label: 'TP',
        default: true,
        required: true
      }, {
        id: 'dp',
        label: 'DP attention',
        default: false,
        disabledWhen: v => v.hardware === 'xeon'
      }, {
        id: 'ep',
        label: 'EP',
        default: false,
        disabledWhen: v => v.hardware === 'xeon'
      }, {
        id: 'mtp',
        label: 'Multi-token Prediction',
        default: false,
        disabledWhen: v => v.hardware === 'xeon'
      }]
    },
    reasoningParser: {
      name: 'reasoningParser',
      title: 'Reasoning Parser',
      items: [{
        id: 'disabled',
        label: 'Disabled',
        default: true
      }, {
        id: 'enabled',
        label: 'Enabled',
        default: false
      }]
    },
    toolcall: {
      name: 'toolcall',
      title: 'Tool Call Parser',
      items: [{
        id: 'disabled',
        label: 'Disabled',
        default: true
      }, {
        id: 'enabled',
        label: 'Enabled',
        default: false
      }]
    }
  };
  const getInitialState = () => {
    const initialState = {};
    Object.entries(options).forEach(([key, option]) => {
      if (option.type === 'checkbox') {
        initialState[key] = option.items.filter(item => item.default).map(item => item.id);
      } else {
        const defaultItem = option.items.find(item => item.default);
        initialState[key] = defaultItem ? defaultItem.id : option.items[0].id;
      }
    });
    return initialState;
  };
  const [values, setValues] = useState(getInitialState);
  const [isDark, setIsDark] = useState(false);
  useEffect(() => {
    const checkDarkMode = () => {
      const html = document.documentElement;
      const isDarkMode = html.classList.contains('dark') || html.getAttribute('data-theme') === 'dark' || html.style.colorScheme === 'dark';
      setIsDark(isDarkMode);
    };
    checkDarkMode();
    const observer = new MutationObserver(checkDarkMode);
    observer.observe(document.documentElement, {
      attributes: true,
      attributeFilter: ['class', 'data-theme', 'style']
    });
    return () => observer.disconnect();
  }, []);
  const handleRadioChange = (optionName, value) => {
    setValues(prev => {
      const next = {
        ...prev,
        [optionName]: value
      };
      if (optionName === 'hardware') {
        if (next.hardware === 'xeon') {
          next.modelname = 'v31terminusint8';
        } else {
          const m = options.modelname.items.find(i => i.id === next.modelname);
          if (m && m.xeonOnly) {
            next.modelname = options.modelname.items.find(i => !i.xeonOnly && i.default)?.id || 'v31';
          }
        }
        const strategyItems = options.strategy.items || [];
        const current = Array.isArray(next.strategy) ? next.strategy : [];
        next.strategy = current.filter(id => {
          const item = strategyItems.find(s => s.id === id);
          if (!item) return false;
          if (typeof item.disabledWhen === 'function' && item.disabledWhen(next)) return false;
          return true;
        });
      }
      return next;
    });
  };
  const handleCheckboxChange = (optionName, itemId, isChecked) => {
    setValues(prev => {
      const currentValues = prev[optionName] || [];
      if (isChecked) {
        return {
          ...prev,
          [optionName]: [...currentValues, itemId]
        };
      } else {
        return {
          ...prev,
          [optionName]: currentValues.filter(id => id !== itemId)
        };
      }
    });
  };
  const generateCommand = () => {
    const {hardware, modelname, strategy, reasoningParser, toolcall} = values;
    const strategyArray = Array.isArray(strategy) ? strategy : [];
    const modelMap = {
      'v31': 'deepseek-ai/DeepSeek-V3.1',
      'v31terminus': 'deepseek-ai/DeepSeek-V3.1-Terminus',
      'v31terminusint8': 'IntervitensInc/DeepSeek-V3.1-Terminus-Channel-int8'
    };
    const modelName = modelMap[modelname];
    const isXeon = hardware === 'xeon';
    let cmd = 'python3 -m sglang.launch_server \\\n';
    cmd += `  --model-path ${modelName}`;
    if (isXeon) {
      cmd += ` \\\n  --device cpu \\\n  --disable-overlap-schedule`;
      if (modelname === 'v31terminusint8') {
        cmd += ` \\\n  --quantization w8a8_int8`;
      }
    }
    cmd += isXeon ? ` \\\n  --tp 6` : ` \\\n  --tp 8`;
    if (strategyArray.includes('dp')) {
      cmd += ` \\\n  --dp 8 \\\n  --enable-dp-attention`;
    }
    if (strategyArray.includes('ep')) {
      cmd += ` \\\n  --ep 8`;
    }
    if (strategyArray.includes('mtp')) {
      cmd += ` \\\n  --speculative-algorithm EAGLE \\\n  --speculative-num-steps 3 \\\n  --speculative-eagle-topk 1 \\\n  --speculative-num-draft-tokens 4`;
    }
    if (toolcall === 'enabled') {
      cmd += ` \\\n  --tool-call-parser deepseekv31`;
    }
    if (reasoningParser === 'enabled') {
      cmd += ` \\\n  --reasoning-parser deepseek-v3`;
    }
    if (toolcall === 'enabled') {
      cmd += ` \\\n  --chat-template ./examples/chat_template/tool_chat_template_deepseekv31.jinja`;
    }
    return cmd;
  };
  const containerStyle = {
    maxWidth: '900px',
    margin: '0 auto',
    display: 'flex',
    flexDirection: 'column',
    gap: '4px'
  };
  const cardStyle = {
    padding: '8px 12px',
    border: `1px solid ${isDark ? '#374151' : '#e5e7eb'}`,
    borderLeft: `3px solid ${isDark ? '#E85D4D' : '#D45D44'}`,
    borderRadius: '4px',
    display: 'flex',
    alignItems: 'center',
    gap: '12px',
    background: isDark ? '#1f2937' : '#fff'
  };
  const titleStyle = {
    fontSize: '13px',
    fontWeight: '600',
    minWidth: '140px',
    flexShrink: 0,
    color: isDark ? '#e5e7eb' : 'inherit'
  };
  const itemsStyle = {
    display: 'flex',
    rowGap: '2px',
    columnGap: '6px',
    flexWrap: 'wrap',
    alignItems: 'center',
    flex: 1
  };
  const labelBaseStyle = {
    padding: '4px 10px',
    border: `1px solid ${isDark ? '#9ca3af' : '#d1d5db'}`,
    borderRadius: '3px',
    cursor: 'pointer',
    display: 'inline-flex',
    flexDirection: 'column',
    alignItems: 'center',
    justifyContent: 'center',
    fontWeight: '500',
    fontSize: '13px',
    transition: 'all 0.2s',
    userSelect: 'none',
    minWidth: '45px',
    textAlign: 'center',
    flex: 1,
    background: isDark ? '#374151' : '#fff',
    color: isDark ? '#e5e7eb' : 'inherit'
  };
  const checkedStyle = {
    background: '#D45D44',
    color: 'white',
    borderColor: '#D45D44'
  };
  const disabledStyle = {
    cursor: 'not-allowed',
    opacity: 0.5
  };
  const subtitleStyle = {
    display: 'block',
    fontSize: '9px',
    marginTop: '1px',
    lineHeight: '1.1',
    opacity: 0.7
  };
  const commandDisplayStyle = {
    flex: 1,
    padding: '12px 16px',
    background: isDark ? '#111827' : '#f5f5f5',
    borderRadius: '6px',
    fontFamily: "'Menlo', 'Monaco', 'Courier New', monospace",
    fontSize: '12px',
    lineHeight: '1.5',
    color: isDark ? '#e5e7eb' : '#374151',
    whiteSpace: 'pre-wrap',
    overflowX: 'auto',
    margin: 0,
    border: `1px solid ${isDark ? '#374151' : '#e5e7eb'}`
  };
  return <div style={containerStyle} className="not-prose">
      {Object.entries(options).map(([key, option]) => <div key={key} style={cardStyle}>
          <div style={titleStyle}>{option.title}</div>
          <div style={itemsStyle}>
            {option.type === 'checkbox' ? option.items.map(item => {
    const isChecked = (values[option.name] || []).includes(item.id);
    const dynDisabled = typeof item.disabledWhen === 'function' && item.disabledWhen(values);
    const isDisabled = item.required || dynDisabled;
    return <label key={item.id} title={dynDisabled ? 'Not supported on the selected hardware' : ''} style={{
      ...labelBaseStyle,
      ...isChecked ? checkedStyle : {},
      ...isDisabled ? disabledStyle : {}
    }}>
                    <input type="checkbox" checked={isChecked} disabled={isDisabled} onChange={e => !dynDisabled && handleCheckboxChange(option.name, item.id, e.target.checked)} style={{
      display: 'none'
    }} />
                    {item.label}
                    {item.subtitle && <small style={{
      ...subtitleStyle,
      color: isChecked ? 'rgba(255,255,255,0.85)' : 'inherit'
    }}>{item.subtitle}</small>}
                  </label>;
  }) : option.items.map(item => {
    const isChecked = values[option.name] === item.id;
    const isDisabled = item.xeonOnly && values.hardware !== 'xeon';
    return <label key={item.id} title={isDisabled ? 'Only available when XEON hardware is selected' : undefined} style={{
      ...labelBaseStyle,
      ...isChecked ? checkedStyle : {},
      ...isDisabled ? disabledStyle : {}
    }}>
                    <input type="radio" name={option.name} value={item.id} checked={isChecked} disabled={isDisabled} onChange={() => !isDisabled && handleRadioChange(option.name, item.id)} style={{
      display: 'none'
    }} />
                    {item.label}
                    {item.subtitle && <small style={{
      ...subtitleStyle,
      color: isChecked ? 'rgba(255,255,255,0.85)' : 'inherit'
    }}>{item.subtitle}</small>}
                  </label>;
  })}
          </div>
        </div>)}
      <div style={cardStyle}>
        <div style={titleStyle}>Run this Command:</div>
        <pre style={commandDisplayStyle}>{generateCommand()}</pre>
      </div>
    </div>;
};

## 1. Model Introduction

[DeepSeek V3.1](https://huggingface.co/deepseek-ai/DeepSeek-V3.1) is an advanced Mixture-of-Experts (MoE) large language model developed by DeepSeek, representing a major capability and usability upgrade over DeepSeek V3. As a refined iteration in the DeepSeek V3 family, DeepSeek V3.1 introduces a hybrid reasoning paradigm that supports both fast non-thinking responses and explicit multi-step reasoning, alongside significantly improved tool calling and agentic behavior. The model demonstrates strong performance across reasoning, mathematics, coding, long-context understanding, and real-world agent workflows, benefiting from continued training, alignment optimization, and inference-time refinements. DeepSeek V3.1 is designed to serve as a robust general-purpose foundation model, well suited for conversational AI, structured tool invocation, search-augmented generation, and complex multi-step tasks, while maintaining high efficiency through its sparse MoE architecture.

**[DeepSeek-V3.1-Terminus](https://huggingface.co/deepseek-ai/DeepSeek-V3.1-Terminus)** is an experimental version designed for general conversations and long-context processing. It features hybrid thinking capabilities: you can toggle between "Think" mode for deliberate reasoning and "Non-Think" mode for faster responses. It is recommended for general conversations, long-context processing, and experimental use cases.

## 2. SGLang Installation

SGLang offers multiple installation methods. You can choose the most suitable installation method based on your hardware platform and requirements.

Please refer to the [official SGLang installation guide](../../../docs/get-started/install) for installation instructions.

For SGLang CPU installation, please refer to the [CPU version installation guide](../../../docs/hardware-platforms/cpu_server#installation).

## 3. Model Deployment

This section provides a progressive guide from quick deployment to performance optimization, suitable for users at different levels.

### 3.1 Basic Configuration

**Interactive Command Generator**: Use the configuration selector below to automatically generate the appropriate deployment command for your hardware platform, model variant, deployment strategy, and reasoning/tool-call parser settings.

<DeepSeekV31Deployment />

### 3.2 Configuration Tips

DeepSeek-V3.1 shares the same model architecture as DeepSeek-V3, so the same hardware and optimization recommendations apply.

**Recommended GPU configurations by weight type:**

<table style={{width: "100%", borderCollapse: "collapse"}}>
  <thead>
    <tr style={{borderBottom: "2px solid #d55816"}}>
      <th style={{textAlign: "left", padding: "10px 12px", fontWeight: 700, backgroundColor: "rgba(255,255,255,0.02)"}}>Weight Type</th>
      <th style={{textAlign: "left", padding: "10px 12px", fontWeight: 700, backgroundColor: "rgba(255,255,255,0.05)"}}>Supported Hardware</th>
    </tr>
  </thead>

  <tbody>
    <tr>
      <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.02)"}}><strong>FP8</strong> (recommended)</td>
      <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.05)"}}>8× H200, 8× B200, 8× MI300X, 2×8× H100/H800/H20</td>
    </tr>

    <tr>
      <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.02)"}}><strong>BF16</strong> (upcast from FP8)</td>
      <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.05)"}}>2×8× H200, 2×8× MI300X, 4×8× H100/H800, 4×8× A100/A800</td>
    </tr>

    <tr>
      <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.02)"}}><strong>INT8</strong></td>
      <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.05)"}}>16× A100/A800, 32× L40S, Xeon 6980P CPU, 4× Atlas 800I A3</td>
    </tr>

    <tr>
      <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.02)"}}><strong>W4A8 / AWQ / MXFP4 / NVFP4</strong></td>
      <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.05)"}}>8× H20/H100, 4× H200; 8× H100/A100; 8/4× MI355X/MI350X; 8/4× B200</td>
    </tr>
  </tbody>
</table>

> The official DeepSeek-V3.1 checkpoint is already in FP8 format — do **not** add `--quantization fp8` when serving it.

**DeepGEMM precompilation (NVIDIA Hopper / Blackwell):** Precompile GEMM kernels before the first server run to avoid JIT overhead (\~10 min):

```bash theme={null}
python3 -m sglang.compile_deep_gemm --model deepseek-ai/DeepSeek-V3.1 --tp 8 --trust-remote-code
```

DeepGEMM is enabled by default on Hopper/Blackwell and can be disabled with `SGLANG_ENABLE_JIT_DEEPGEMM=0`.

**Data Parallelism Attention (`--enable-dp-attention`):** Recommended for high-throughput scenarios with large batch sizes. Reduces KV-cache duplication across TP ranks. Use `--enable-dp-attention --tp 8 --dp 8` on a single 8-GPU node. Not recommended for low-latency, small-batch workloads.

**NCCL timeout:** If model loading is slow and you hit an NCCL timeout, increase it: `--dist-timeout 3600`.

To configure CPU serving, refer to the `Notes` in the serving-engine launch section of [the SGLang CPU server document](../../../docs/hardware-platforms/cpu_server#launch-of-the-serving-engine). They explain how to set the launch arguments, especially the TP (tensor parallel) and NUMA binding settings.

## 4. Model Invocation

### 4.1 Basic Usage

For basic API usage and request examples, please refer to:

* [Basic API Usage](../../../docs/get-started/quickstart)

### 4.2 Advanced Usage

#### 4.2.1 Reasoning Parser

DeepSeek-V3.1 supports reasoning mode. Enable the reasoning parser during deployment to separate the thinking and content sections:

```shell Command theme={null}
python -m sglang.launch_server \
  --model deepseek-ai/DeepSeek-V3.1-Terminus \
  --reasoning-parser deepseek-v3 \
  --tp 8 \
  --host 0.0.0.0 \
  --port 8000
```

**Streaming with Thinking Process:**

```python Example theme={null}
from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:8000/v1",
    api_key="EMPTY"
)

# Enable streaming to see the thinking process in real-time
response = client.chat.completions.create(
    model="deepseek-ai/DeepSeek-V3.1-Terminus",
    messages=[
        {"role": "user", "content": "Solve this problem step by step: What is 15% of 240?"}
    ],
    temperature=0.7,
    max_tokens=2048,
    extra_body = {"chat_template_kwargs": {"thinking": True}},
    stream=True
)

# Process the stream
has_thinking = False
has_answer = False
thinking_started = False

for chunk in response:
    if chunk.choices and len(chunk.choices) > 0:
        delta = chunk.choices[0].delta

        # Print thinking process
        if hasattr(delta, 'reasoning_content') and delta.reasoning_content:
            if not thinking_started:
                print("=============== Thinking =================", flush=True)
                thinking_started = True
            has_thinking = True
            print(delta.reasoning_content, end="", flush=True)

        # Print answer content
        if delta.content:
            # Close thinking section and add content header
            if has_thinking and not has_answer:
                print("\n=============== Content =================", flush=True)
                has_answer = True
            print(delta.content, end="", flush=True)

print()
```

**Output Example:**

```text Output theme={null}
=============== Thinking =================
First, the problem is asking for 15% of 240. Percent means per hundred, so 15% is the same as 15 out of 100, or 15/100.

To find a percentage of a number, I can multiply the number by the percentage expressed as a decimal. So, I need to convert 15% to a decimal. To do that, I divide 15 by 100, which gives me 0.15.

Now, I multiply 0.15 by 240. So, the calculation is 0.15 × 240.

I can compute this step by step. First, I know that 15% of 100 is 15, but since 240 is larger, I need to adjust. Alternatively, I can think of 10% of 240, which is easy because 10% is just 240 divided by 10, which is 24. Then, 5% is half of 10%, so half of 24 is 12. Therefore, 15% is 10% plus 5%, so 24 plus 12, which equals 36.

I should also do the multiplication to confirm. 0.15 × 240. I can break it down: 0.15 × 200 = 30, and 0.15 × 40 = 6, so 30 + 6 = 36. Same answer.

So, 15% of 240 is 36.

The problem says "step by step," so I should present it clearly.
=============== Content =================
To find 15% of 240, follow these steps:

1. Understand that "percent" means "per hundred," so 15% is equivalent to \( \frac{15}{100} \).
2. Convert 15% to a decimal by dividing by 100: \( 15\% = \frac{15}{100} = 0.15 \).
3. Multiply the decimal by 240: \( 0.15 \times 240 \).
4. Perform the multiplication:
   - \( 0.15 \times 200 = 30 \)
   - \( 0.15 \times 40 = 6 \)
   - Add the results: \( 30 + 6 = 36 \).

Alternatively, you can find 15% by breaking it into parts:
- 10% of 240 is \( \frac{10}{100} \times 240 = 0.10 \times 240 = 24 \).
- 5% of 240 is half of 10%, so \( \frac{24}{2} = 12 \).
- Add 10% and 5%: \( 24 + 12 = 36 \).

Thus, 15% of 240 is 36.
```

**Note:** The reasoning parser captures the model's step-by-step thinking process, allowing you to see how the model arrives at its conclusions.

#### 4.2.2 Tool Calling

DeepSeek-V3.1 and DeepSeek-V3.1-Terminus support tool calling capabilities. Enable the tool call parser:

**Deployment Command:**

```shell Command theme={null}
python -m sglang.launch_server \
  --model deepseek-ai/DeepSeek-V3.1-Terminus \
  --tool-call-parser deepseekv31 \
  --reasoning-parser deepseek-v3 \
  --chat-template ./examples/chat_template/tool_chat_template_deepseekv31.jinja \
  --tp 8 \
  --host 0.0.0.0 \
  --port 8000
```

For DeepSeek-V3.1, use `--tool-call-parser deepseekv31` as well.

**Python Example (with Thinking Process):**

```python Example theme={null}
from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:8000/v1",
    api_key="EMPTY"
)

# Define available tools
tools = [
    {
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Get the current weather for a location",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {
                        "type": "string",
                        "description": "The city name"
                    },
                    "unit": {
                        "type": "string",
                        "enum": ["celsius", "fahrenheit"],
                        "description": "Temperature unit"
                    }
                },
                "required": ["location"]
            }
        }
    }
]

# Make request with streaming to see thinking process
response = client.chat.completions.create(
    model="deepseek-ai/DeepSeek-V3.1-Terminus",
    messages=[
        {"role": "user", "content": "What's the weather in Beijing?"}
    ],
    tools=tools,
    extra_body = {"chat_template_kwargs": {"thinking": True}},
    temperature=0.7,
    stream=True
)

# Process streaming response
thinking_started = False
has_thinking = False
tool_calls_accumulator = {}

for chunk in response:
    if chunk.choices and len(chunk.choices) > 0:
        delta = chunk.choices[0].delta

        # Print thinking process
        if hasattr(delta, 'reasoning_content') and delta.reasoning_content:
            if not thinking_started:
                print("=============== Thinking =================", flush=True)
                thinking_started = True
            has_thinking = True
            print(delta.reasoning_content, end="", flush=True)

        # Accumulate tool calls
        if hasattr(delta, 'tool_calls') and delta.tool_calls:
            # Close thinking section if needed
            if has_thinking and thinking_started:
                print("\n=============== Content =================\n", flush=True)
                thinking_started = False

            for tool_call in delta.tool_calls:
                index = tool_call.index
                if index not in tool_calls_accumulator:
                    tool_calls_accumulator[index] = {
                        'name': None,
                        'arguments': ''
                    }

                if tool_call.function:
                    if tool_call.function.name:
                        tool_calls_accumulator[index]['name'] = tool_call.function.name
                    if tool_call.function.arguments:
                        tool_calls_accumulator[index]['arguments'] += tool_call.function.arguments

        # Print content
        if delta.content:
            print(delta.content, end="", flush=True)

# Print accumulated tool calls
for index, tool_call in sorted(tool_calls_accumulator.items()):
    print(f"🔧 Tool Call: {tool_call['name']}")
    print(f"   Arguments: {tool_call['arguments']}")

print()
```

**Output Example:**

```text Output theme={null}
=============== Thinking =================
Hmm, the user is asking for the weather in Beijing. This is a straightforward request that matches exactly what the weather tool can provide.

I need to call the get_weather function with Beijing as the location parameter. The user didn't specify a temperature unit, so I'll default to Celsius since that's commonly used in most parts of the world.

The tool call format needs to be precise - just the city name and unit selection. Once I get the weather data back, I'll present it clearly to the user.I'll check the weather in Beijing for you.
=============== Content =================

🔧 Tool Call: get_weather
   Arguments: {"location": "Beijing", "unit": "celsius"}
```

**Note:**

* The reasoning parser shows how the model decides to use a tool
* Tool calls are clearly marked with the function name and arguments
* You can then execute the function and send the result back to continue the conversation

**Handling Tool Call Results:**

Append the code block below to the previous Python script.

```python Example theme={null}
# After getting the tool call, execute the function
def get_weather(location, unit="celsius"):
    # Your actual weather API call here
    return f"The weather in {location} is 22°{unit[0].upper()} and sunny."

# Send tool result back to the model
messages = [
    {"role": "user", "content": "What's the weather in Beijing?"},
    {
        "role": "assistant",
        "content": None,
        "tool_calls": [{
            "id": "call_123",
            "type": "function",
            "function": {
                "name": "get_weather",
                "arguments": '{"location": "Beijing", "unit": "celsius"}'
            }
        }]
    },
    {
        "role": "tool",
        "tool_call_id": "call_123",
        "content": get_weather("Beijing", "celsius")
    }
]

final_response = client.chat.completions.create(
    model="deepseek-ai/DeepSeek-V3.1-Terminus",
    messages=messages,
    temperature=0.7
)

print(final_response.choices[0].message.content)
# Output: "Currently, it is **22°C and sunny** in Beijing."
```

#### 4.2.3 Multi-Token Prediction (EAGLE Speculative Decoding)

DeepSeek-V3.1 shares the same architecture as DeepSeek-V3 and supports the same EAGLE-based MTP speculative decoding path. Refer to [DeepSeek-V3 §4.2.3](/cookbook/autoregressive/DeepSeek/DeepSeek-V3#4-2-3-multi-token-prediction-eagle-speculative-decoding) for the full configuration, tuning guidance, and `bench_speculative.py` reference. The `--speculative-num-steps`, `--speculative-eagle-topk`, and `--max-running-requests` recommendations apply equally to V3.1.

## 5. Benchmark

### 5.1 Speed Benchmark

**Test Environment:**

* Hardware: AMD MI300X GPU (8x)
* Model: DeepSeek-V3.1-Terminus
* Tensor Parallelism: 8
* sglang version: 0.5.7

**Benchmark Methodology:**

We use industry-standard benchmark configurations to ensure results are comparable across frameworks and hardware platforms.

#### 5.1.1 Standard Test Scenarios

Three core scenarios reflect real-world usage patterns:

<table style={{width: "100%", borderCollapse: "collapse", tableLayout: "fixed"}}>
  <colgroup>
    <col style={{width: "25%"}} />

    <col style={{width: "25%"}} />

    <col style={{width: "25%"}} />

    <col style={{width: "25%"}} />
  </colgroup>

  <thead>
    <tr style={{borderBottom: "2px solid #d55816"}}>
      <th style={{textAlign: "left", padding: "10px 12px", fontWeight: 700, whiteSpace: "nowrap", backgroundColor: "rgba(255,255,255,0.02)"}}>Scenario</th>
      <th style={{textAlign: "left", padding: "10px 12px", fontWeight: 700, whiteSpace: "nowrap", backgroundColor: "rgba(255,255,255,0.05)"}}>Input Length</th>
      <th style={{textAlign: "left", padding: "10px 12px", fontWeight: 700, whiteSpace: "nowrap", backgroundColor: "rgba(255,255,255,0.02)"}}>Output Length</th>
      <th style={{textAlign: "left", padding: "10px 12px", fontWeight: 700, whiteSpace: "nowrap", backgroundColor: "rgba(255,255,255,0.05)"}}>Use Case</th>
    </tr>
  </thead>

  <tbody>
    <tr>
      <td style={{padding: "9px 12px", fontWeight: 500, backgroundColor: "rgba(255,255,255,0.02)"}}><strong>Chat</strong></td>
      <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.05)"}}>1K</td>
      <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.02)"}}>1K</td>
      <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.05)"}}>Most common conversational AI workload</td>
    </tr>

    <tr>
      <td style={{padding: "9px 12px", fontWeight: 500, backgroundColor: "rgba(255,255,255,0.02)"}}><strong>Reasoning</strong></td>
      <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.05)"}}>1K</td>
      <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.02)"}}>8K</td>
      <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.05)"}}>Long-form generation, complex reasoning tasks</td>
    </tr>

    <tr>
      <td style={{padding: "9px 12px", fontWeight: 500, backgroundColor: "rgba(255,255,255,0.02)"}}><strong>Summarization</strong></td>
      <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.05)"}}>8K</td>
      <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.02)"}}>1K</td>
      <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.05)"}}>Document summarization, RAG retrieval</td>
    </tr>
  </tbody>
</table>

#### 5.1.2 Concurrency Levels

Test each scenario at different concurrency levels to capture the throughput vs. latency trade-off:

* **Low Concurrency**: `--max-concurrency 1` (Latency-optimized)
* **Medium Concurrency**: `--max-concurrency 16` (Balanced)
* **High Concurrency**: `--max-concurrency 100` (Throughput-optimized)

#### 5.1.3 Number of Prompts

For each concurrency level, configure `num_prompts` to simulate realistic user loads:

* **Quick Test**: `num_prompts = concurrency × 1` (minimal test)
* **Recommended**: `num_prompts = concurrency × 5` (standard benchmark)
* **Stable Measurements**: `num_prompts = concurrency × 10` (production-grade)

***

#### 5.1.4 Benchmark Commands

**Scenario 1: Chat (1K/1K) - Most Important**

* **Model Deployment**

```bash Command theme={null}
python -m sglang.launch_server \
  --model-path deepseek-ai/DeepSeek-V3.1 \
  --tp 8
```

* Low Concurrency (Latency-Optimized)

```bash Command theme={null}
python -m sglang.bench_serving \
  --backend sglang \
  --model deepseek-ai/DeepSeek-V3.1 \
  --dataset-name random \
  --random-input-len 1000 \
  --random-output-len 1000 \
  --num-prompts 10 \
  --max-concurrency 1 \
  --request-rate inf
```

```text Output theme={null}
============ Serving Benchmark Result ============
Backend:                                 sglang
Traffic request rate:                    inf
Max request concurrency:                 1
Successful requests:                     10
Benchmark duration (s):                  106.24
Total input tokens:                      6101
Total input text tokens:                 6101
Total input vision tokens:               0
Total generated tokens:                  4220
Total generated tokens (retokenized):    4201
Request throughput (req/s):              0.09
Input token throughput (tok/s):          57.43
Output token throughput (tok/s):         39.72
Peak output token throughput (tok/s):    43.00
Peak concurrent requests:                2
Total token throughput (tok/s):          97.15
Concurrency:                             1.00
----------------End-to-End Latency----------------
Mean E2E Latency (ms):                   10620.29
Median E2E Latency (ms):                 8868.09
---------------Time to First Token----------------
Mean TTFT (ms):                          557.85
Median TTFT (ms):                        213.58
P99 TTFT (ms):                           1625.28
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms):                          23.84
Median TPOT (ms):                        23.90
P99 TPOT (ms):                           24.03
---------------Inter-Token Latency----------------
Mean ITL (ms):                           23.90
Median ITL (ms):                         23.92
P95 ITL (ms):                            24.15
P99 ITL (ms):                            24.25
Max ITL (ms):                            25.44
==================================================
```

* Medium Concurrency (Balanced)

```bash Command theme={null}
python -m sglang.bench_serving \
  --backend sglang \
  --model deepseek-ai/DeepSeek-V3.1 \
  --dataset-name random \
  --random-input-len 1000 \
  --random-output-len 1000 \
  --num-prompts 80 \
  --max-concurrency 16 \
  --request-rate inf
```

```text Output theme={null}
============ Serving Benchmark Result ============
Backend:                                 sglang
Traffic request rate:                    inf
Max request concurrency:                 16
Successful requests:                     80
Benchmark duration (s):                  107.71
Total input tokens:                      39668
Total input text tokens:                 39668
Total input vision tokens:               0
Total generated tokens:                  40805
Total generated tokens (retokenized):    40625
Request throughput (req/s):              0.74
Input token throughput (tok/s):          368.28
Output token throughput (tok/s):         378.84
Peak output token throughput (tok/s):    508.00
Peak concurrent requests:                19
Total token throughput (tok/s):          747.12
Concurrency:                             13.72
----------------End-to-End Latency----------------
Mean E2E Latency (ms):                   18473.65
Median E2E Latency (ms):                 19558.42
---------------Time to First Token----------------
Mean TTFT (ms):                          607.91
Median TTFT (ms):                        191.32
P99 TTFT (ms):                           2135.13
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms):                          35.50
Median TPOT (ms):                        35.99
P99 TPOT (ms):                           43.62
---------------Inter-Token Latency----------------
Mean ITL (ms):                           35.10
Median ITL (ms):                         32.18
P95 ITL (ms):                            33.03
P99 ITL (ms):                            159.99
Max ITL (ms):                            453.99
==================================================
```

* High Concurrency (Throughput-Optimized)

```bash Command theme={null}
python -m sglang.bench_serving \
  --backend sglang \
  --model deepseek-ai/DeepSeek-V3.1 \
  --dataset-name random \
  --random-input-len 1000 \
  --random-output-len 1000 \
  --num-prompts 500 \
  --max-concurrency 100 \
  --request-rate inf
```

```text Output theme={null}
============ Serving Benchmark Result ============
Backend:                                 sglang
Traffic request rate:                    inf
Max request concurrency:                 100
Successful requests:                     500
Benchmark duration (s):                  207.65
Total input tokens:                      249831
Total input text tokens:                 249831
Total input vision tokens:               0
Total generated tokens:                  252662
Total generated tokens (retokenized):    251238
Request throughput (req/s):              2.41
Input token throughput (tok/s):          1203.15
Output token throughput (tok/s):         1216.79
Peak output token throughput (tok/s):    2100.00
Peak concurrent requests:                106
Total token throughput (tok/s):          2419.94
Concurrency:                             91.02
----------------End-to-End Latency----------------
Mean E2E Latency (ms):                   37800.20
Median E2E Latency (ms):                 35921.56
---------------Time to First Token----------------
Mean TTFT (ms):                          835.15
Median TTFT (ms):                        236.88
P99 TTFT (ms):                           2868.52
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms):                          73.33
Median TPOT (ms):                        76.35
P99 TPOT (ms):                           97.63
---------------Inter-Token Latency----------------
Mean ITL (ms):                           73.30
Median ITL (ms):                         50.82
P95 ITL (ms):                            180.67
P99 ITL (ms):                            186.83
Max ITL (ms):                            1661.39
==================================================
```

**Scenario 2: Reasoning (1K/8K)**

* Low Concurrency

```bash Command theme={null}
python -m sglang.bench_serving \
  --backend sglang \
  --model deepseek-ai/DeepSeek-V3.1 \
  --dataset-name random \
  --random-input-len 1000 \
  --random-output-len 8000 \
  --num-prompts 10 \
  --max-concurrency 1 \
  --request-rate inf
```

```text Output theme={null}
============ Serving Benchmark Result ============
Backend:                                 sglang
Traffic request rate:                    inf
Max request concurrency:                 1
Successful requests:                     10
Benchmark duration (s):                  1097.29
Total input tokens:                      6101
Total input text tokens:                 6101
Total input vision tokens:               0
Total generated tokens:                  44462
Total generated tokens (retokenized):    44313
Request throughput (req/s):              0.01
Input token throughput (tok/s):          5.56
Output token throughput (tok/s):         40.52
Peak output token throughput (tok/s):    43.00
Peak concurrent requests:                2
Total token throughput (tok/s):          46.08
Concurrency:                             1.00
----------------End-to-End Latency----------------
Mean E2E Latency (ms):                   109725.52
Median E2E Latency (ms):                 117748.67
---------------Time to First Token----------------
Mean TTFT (ms):                          156.67
Median TTFT (ms):                        156.19
P99 TTFT (ms):                           159.87
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms):                          24.41
Median TPOT (ms):                        24.51
P99 TPOT (ms):                           24.96
---------------Inter-Token Latency----------------
Mean ITL (ms):                           24.65
Median ITL (ms):                         24.58
P95 ITL (ms):                            25.68
P99 ITL (ms):                            25.93
Max ITL (ms):                            29.80
==================================================
```

* Medium Concurrency

```bash Command theme={null}
python -m sglang.bench_serving \
  --backend sglang \
  --model deepseek-ai/DeepSeek-V3.1 \
  --dataset-name random \
  --random-input-len 1000 \
  --random-output-len 8000 \
  --num-prompts 80 \
  --max-concurrency 16 \
  --request-rate inf
```

```text Output theme={null}
============ Serving Benchmark Result ============
Backend:                                 sglang
Traffic request rate:                    inf
Max request concurrency:                 16
Successful requests:                     80
Benchmark duration (s):                  775.02
Total input tokens:                      39668
Total input text tokens:                 39668
Total input vision tokens:               0
Total generated tokens:                  318306
Total generated tokens (retokenized):    317426
Request throughput (req/s):              0.10
Input token throughput (tok/s):          51.18
Output token throughput (tok/s):         410.70
Peak output token throughput (tok/s):    512.00
Peak concurrent requests:                18
Total token throughput (tok/s):          461.89
Concurrency:                             13.86
----------------End-to-End Latency----------------
Mean E2E Latency (ms):                   134236.65
Median E2E Latency (ms):                 135181.28
---------------Time to First Token----------------
Mean TTFT (ms):                          214.35
Median TTFT (ms):                        194.12
P99 TTFT (ms):                           300.27
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms):                          33.72
Median TPOT (ms):                        34.00
P99 TPOT (ms):                           34.75
---------------Inter-Token Latency----------------
Mean ITL (ms):                           33.69
Median ITL (ms):                         33.71
P95 ITL (ms):                            34.50
P99 ITL (ms):                            34.92
Max ITL (ms):                            164.76
==================================================
```

* High Concurrency

```bash Command theme={null}
python -m sglang.bench_serving \
  --backend sglang \
  --model deepseek-ai/DeepSeek-V3.1 \
  --dataset-name random \
  --random-input-len 1000 \
  --random-output-len 8000 \
  --num-prompts 320 \
  --max-concurrency 64 \
  --request-rate inf
```

```text Output theme={null}
============ Serving Benchmark Result ============
Backend:                                 sglang
Traffic request rate:                    inf
Max request concurrency:                 64
Successful requests:                     320
Benchmark duration (s):                  1231.97
Total input tokens:                      158939
Total input text tokens:                 158939
Total input vision tokens:               0
Total generated tokens:                  1301025
Total generated tokens (retokenized):    1296845
Request throughput (req/s):              0.26
Input token throughput (tok/s):          129.01
Output token throughput (tok/s):         1056.05
Peak output token throughput (tok/s):    1472.00
Peak concurrent requests:                67
Total token throughput (tok/s):          1185.07
Concurrency:                             56.17
----------------End-to-End Latency----------------
Mean E2E Latency (ms):                   216256.25
Median E2E Latency (ms):                 224192.84
---------------Time to First Token----------------
Mean TTFT (ms):                          317.68
Median TTFT (ms):                        235.28
P99 TTFT (ms):                           649.39
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms):                          53.30
Median TPOT (ms):                        55.10
P99 TPOT (ms):                           56.58
---------------Inter-Token Latency----------------
Mean ITL (ms):                           53.13
Median ITL (ms):                         52.95
P95 ITL (ms):                            56.23
P99 ITL (ms):                            181.04
Max ITL (ms):                            208.61
==================================================
```

**Scenario 3: Summarization (8K/1K)**

* Low Concurrency

```bash Command theme={null}
python -m sglang.bench_serving \
  --backend sglang \
  --model deepseek-ai/DeepSeek-V3.1 \
  --dataset-name random \
  --random-input-len 8000 \
  --random-output-len 1000 \
  --num-prompts 10 \
  --max-concurrency 1 \
  --request-rate inf
```

```text Output theme={null}
============ Serving Benchmark Result ============
Backend:                                 sglang
Traffic request rate:                    inf
Max request concurrency:                 1
Successful requests:                     10
Benchmark duration (s):                  114.47
Total input tokens:                      41941
Total input text tokens:                 41941
Total input vision tokens:               0
Total generated tokens:                  4220
Total generated tokens (retokenized):    4194
Request throughput (req/s):              0.09
Input token throughput (tok/s):          366.39
Output token throughput (tok/s):         36.87
Peak output token throughput (tok/s):    42.00
Peak concurrent requests:                2
Total token throughput (tok/s):          403.26
Concurrency:                             1.00
----------------End-to-End Latency----------------
Mean E2E Latency (ms):                   11442.86
Median E2E Latency (ms):                 9508.87
---------------Time to First Token----------------
Mean TTFT (ms):                          883.78
Median TTFT (ms):                        481.38
P99 TTFT (ms):                           2217.45
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms):                          24.93
Median TPOT (ms):                        25.05
P99 TPOT (ms):                           26.11
---------------Inter-Token Latency----------------
Mean ITL (ms):                           25.08
Median ITL (ms):                         25.08
P95 ITL (ms):                            26.18
P99 ITL (ms):                            26.28
Max ITL (ms):                            27.41
==================================================
```

* Medium Concurrency

```bash Command theme={null}
python -m sglang.bench_serving \
  --backend sglang \
  --model deepseek-ai/DeepSeek-V3.1 \
  --dataset-name random \
  --random-input-len 8000 \
  --random-output-len 1000 \
  --num-prompts 80 \
  --max-concurrency 16 \
  --request-rate inf
```

```text Output theme={null}
============ Serving Benchmark Result ============
Backend:                                 sglang
Traffic request rate:                    inf
Max request concurrency:                 16
Successful requests:                     80
Benchmark duration (s):                  162.33
Total input tokens:                      300020
Total input text tokens:                 300020
Total input vision tokens:               0
Total generated tokens:                  41669
Total generated tokens (retokenized):    41443
Request throughput (req/s):              0.49
Input token throughput (tok/s):          1848.27
Output token throughput (tok/s):         256.70
Peak output token throughput (tok/s):    467.00
Peak concurrent requests:                19
Total token throughput (tok/s):          2104.97
Concurrency:                             14.52
----------------End-to-End Latency----------------
Mean E2E Latency (ms):                   29456.89
Median E2E Latency (ms):                 27628.16
---------------Time to First Token----------------
Mean TTFT (ms):                          1784.30
Median TTFT (ms):                        1347.21
P99 TTFT (ms):                           5384.54
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms):                          53.65
Median TPOT (ms):                        52.09
P99 TPOT (ms):                           74.39
---------------Inter-Token Latency----------------
Mean ITL (ms):                           53.23
Median ITL (ms):                         34.52
P95 ITL (ms):                            35.81
P99 ITL (ms):                            513.25
Max ITL (ms):                            2865.73
==================================================
```

* High Concurrency

```bash Command theme={null}
python -m sglang.bench_serving \
  --backend sglang \
  --model deepseek-ai/DeepSeek-V3.1 \
  --dataset-name random \
  --random-input-len 8000 \
  --random-output-len 1000 \
  --num-prompts 320 \
  --max-concurrency 64 \
  --request-rate inf
```

```text Output theme={null}
============ Serving Benchmark Result ============
Backend:                                 sglang
Traffic request rate:                    inf
Max request concurrency:                 64
Successful requests:                     320
Benchmark duration (s):                  282.55
Total input tokens:                      1273893
Total input text tokens:                 1273893
Total input vision tokens:               0
Total generated tokens:                  170000
Total generated tokens (retokenized):    169081
Request throughput (req/s):              1.13
Input token throughput (tok/s):          4508.6
Output token throughput (tok/s):         601.67
Peak output token throughput (tok/s):    1216
Peak concurrent requests:                68
Total token throughput (tok/s):          5110.27
Concurrency:                             59.81
----------------End-to-End Latency----------------
Mean E2E Latency (ms):                   52810.32
Median E2E Latency (ms):                 50981.81
---------------Time to First Token----------------
Mean TTFT (ms):                          786.69
Median TTFT (ms):                        499.38
P99 TTFT (ms):                           2925.98
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms):                          97.93
Median TPOT (ms):                        103.45
P99 TPOT (ms):                           157.84
---------------Inter-Token Latency----------------
Mean ITL (ms):                           98.11
Median ITL (ms):                         55.7
P95 ITL (ms):                            240.71
P99 ITL (ms):                            1114.36
==================================================
```

#### 5.1.5 Understanding the Results

**Key Metrics:**

* **Request Throughput (req/s)**: Number of requests processed per second
* **Output Token Throughput (tok/s)**: Output (generated) tokens per second, aggregated across all concurrent requests; input tokens are not counted
* **Mean TTFT (ms)**: Time to First Token - measures responsiveness
* **Mean TPOT (ms)**: Time Per Output Token - measures generation speed
* **Mean ITL (ms)**: Inter-Token Latency - measures streaming consistency

**Why These Configurations Matter:**

* **1K/1K (Chat)**: Represents the most common conversational AI workload. This is the highest priority scenario for most deployments.
* **1K/8K (Reasoning)**: Tests long-form generation capabilities crucial for complex reasoning, code generation, and detailed explanations.
* **8K/1K (Summarization)**: Evaluates performance with large context inputs, essential for RAG systems, document Q\&A, and summarization tasks.
* **Variable Concurrency**: Captures the Pareto frontier - the optimal trade-off between throughput and latency at different load levels. Low concurrency shows best-case latency, high concurrency shows maximum throughput.

**Interpreting Results:**

* Compare your results against baseline numbers for your hardware
* Higher throughput at the same latency = better performance
* Lower TTFT = more responsive user experience
* Lower TPOT = faster generation speed

### 5.2 Accuracy Benchmark

This section reports model accuracy on standard benchmarks:

#### 5.2.1 GSM8K Benchmark

* Benchmark Command

```bash Command theme={null}
python3 benchmark/gsm8k/bench_sglang.py \
  --num-shots 8 \
  --num-questions 1316 \
  --parallel 1316
```

**Test Results:**

```text Output theme={null}
Accuracy: 0.959
Invalid: 0.000
Latency: 29.185 s
Output throughput: 4854.672 token/s
```