> ## Documentation Index
> Fetch the complete documentation index at: https://docs.sglang.io/llms.txt
> Use this file to discover all available pages before exploring further.

# Ling-2.6

export const Ling261TDeployment = () => {
  const options = {
    hardware: {
      name: 'hardware',
      title: 'Hardware Platform',
      items: [{
        id: 'gb300',
        label: 'GB300 ×4 (1 node)',
        default: true
      }, {
        id: 'gb200',
        label: 'GB200 ×4 (1 node)',
        default: false
      }, {
        id: 'h200',
        label: 'H200 ×8 (2 nodes)',
        default: false
      }, {
        id: 'b200',
        label: 'B200 ×8 (2 nodes)',
        default: false
      }]
    },
    toolcall: {
      name: 'toolcall',
      title: 'Tool Call Parser',
      items: [{
        id: 'enabled',
        label: 'Enabled',
        default: true
      }, {
        id: 'disabled',
        label: 'Disabled',
        default: false
      }]
    },
    reasoning: {
      name: 'reasoning',
      title: 'Reasoning Parser',
      items: [{
        id: 'disabled',
        label: 'Disabled',
        default: true
      }, {
        id: 'enabled',
        label: 'qwen3 (split <think>)',
        default: false
      }]
    }
  };
  const getInitialState = () => {
    const initialState = {};
    Object.entries(options).forEach(([key, option]) => {
      if (option.type === 'checkbox') {
        initialState[key] = option.items.filter(item => item.default).map(item => item.id);
      } else {
        const defaultItem = option.items.find(item => item.default);
        initialState[key] = defaultItem ? defaultItem.id : option.items[0].id;
      }
    });
    return initialState;
  };
  const [values, setValues] = useState(getInitialState);
  const [isDark, setIsDark] = useState(false);
  useEffect(() => {
    const checkDarkMode = () => {
      const html = document.documentElement;
      const isDarkMode = html.classList.contains('dark') || html.getAttribute('data-theme') === 'dark' || html.style.colorScheme === 'dark';
      setIsDark(isDarkMode);
    };
    checkDarkMode();
    const observer = new MutationObserver(checkDarkMode);
    observer.observe(document.documentElement, {
      attributes: true,
      attributeFilter: ['class', 'data-theme', 'style']
    });
    return () => observer.disconnect();
  }, []);
  const handleRadioChange = (optionName, value) => {
    setValues(prev => ({
      ...prev,
      [optionName]: value
    }));
  };
  const handleCheckboxChange = (optionName, itemId, isChecked) => {
    setValues(prev => {
      const currentValues = prev[optionName] || [];
      if (isChecked) {
        return {
          ...prev,
          [optionName]: [...currentValues, itemId]
        };
      } else {
        return {
          ...prev,
          [optionName]: currentValues.filter(id => id !== itemId)
        };
      }
    });
  };
  const generateCommand = () => {
    const {hardware, toolcall, reasoning} = values;
    const isSingleNode = hardware === 'gb300' || hardware === 'gb200';
    const tail = cmd => {
      let out = cmd;
      out += ` \\\n  --model-loader-extra-config '{"enable_multithread_load":"true","num_threads":64}'`;
      if (toolcall === 'enabled') out += ` \\\n  --tool-call-parser qwen`;
      if (reasoning === 'enabled') out += ` \\\n  --reasoning-parser qwen3`;
      return out;
    };
    if (isSingleNode) {
      let cmd = `sglang serve \\\n`;
      cmd += `  --model-path inclusionAI/Ling-2.6-1T \\\n`;
      cmd += `  --tp-size 4 \\\n`;
      cmd += `  --trust-remote-code \\\n`;
      cmd += `  --host 0.0.0.0 \\\n`;
      cmd += `  --port \${PORT}`;
      return tail(cmd);
    }
    const generateNodeCmd = rank => {
      let cmd = `sglang serve \\\n`;
      cmd += `  --model-path inclusionAI/Ling-2.6-1T \\\n`;
      cmd += `  --tp-size 8 \\\n`;
      cmd += `  --pp-size 2 \\\n`;
      cmd += `  --nnodes 2 \\\n`;
      cmd += `  --node-rank ${rank} \\\n`;
      cmd += `  --trust-remote-code \\\n`;
      if (rank === 0) {
        cmd += `  --host 0.0.0.0 \\\n`;
        cmd += `  --port \${PORT} \\\n`;
      }
      cmd += `  --dist-init-addr \${MASTER_IP}:\${DIST_PORT}`;
      return tail(cmd);
    };
    let output = `# MASTER_IP is Node 0 IP. PORT and DIST_PORT can be assigned by yourself.\n\n`;
    output += `# Node 0:\n`;
    output += generateNodeCmd(0);
    output += `\n\n\n# Node 1:\n`;
    output += generateNodeCmd(1);
    return output;
  };
  const containerStyle = {
    maxWidth: '900px',
    margin: '0 auto',
    display: 'flex',
    flexDirection: 'column',
    gap: '4px'
  };
  const cardStyle = {
    padding: '8px 12px',
    border: `1px solid ${isDark ? '#374151' : '#e5e7eb'}`,
    borderLeft: `3px solid ${isDark ? '#E85D4D' : '#D45D44'}`,
    borderRadius: '4px',
    display: 'flex',
    alignItems: 'center',
    gap: '12px',
    background: isDark ? '#1f2937' : '#fff'
  };
  const titleStyle = {
    fontSize: '13px',
    fontWeight: '600',
    minWidth: '140px',
    flexShrink: 0,
    color: isDark ? '#e5e7eb' : 'inherit'
  };
  const itemsStyle = {
    display: 'flex',
    rowGap: '2px',
    columnGap: '6px',
    flexWrap: 'wrap',
    alignItems: 'center',
    flex: 1
  };
  const labelBaseStyle = {
    padding: '4px 10px',
    border: `1px solid ${isDark ? '#9ca3af' : '#d1d5db'}`,
    borderRadius: '3px',
    cursor: 'pointer',
    display: 'inline-flex',
    flexDirection: 'column',
    alignItems: 'center',
    justifyContent: 'center',
    fontWeight: '500',
    fontSize: '13px',
    transition: 'all 0.2s',
    userSelect: 'none',
    minWidth: '45px',
    textAlign: 'center',
    flex: 1,
    background: isDark ? '#374151' : '#fff',
    color: isDark ? '#e5e7eb' : 'inherit'
  };
  const checkedStyle = {
    background: '#D45D44',
    color: 'white',
    borderColor: '#D45D44'
  };
  const disabledStyle = {
    cursor: 'not-allowed',
    opacity: 0.5
  };
  const subtitleStyle = {
    display: 'block',
    fontSize: '9px',
    marginTop: '1px',
    lineHeight: '1.1',
    opacity: 0.7
  };
  const commandDisplayStyle = {
    flex: 1,
    padding: '12px 16px',
    background: isDark ? '#111827' : '#f5f5f5',
    borderRadius: '6px',
    fontFamily: "'Menlo', 'Monaco', 'Courier New', monospace",
    fontSize: '12px',
    lineHeight: '1.5',
    color: isDark ? '#e5e7eb' : '#374151',
    whiteSpace: 'pre-wrap',
    overflowX: 'auto',
    margin: 0,
    border: `1px solid ${isDark ? '#374151' : '#e5e7eb'}`
  };
  return <div style={containerStyle} className="not-prose">
      {Object.entries(options).map(([key, option]) => <div key={key} style={cardStyle}>
          <div style={titleStyle}>{option.title}</div>
          <div style={itemsStyle}>
            {option.type === 'checkbox' ? option.items.map(item => {
    const isChecked = (values[option.name] || []).includes(item.id);
    const isItemDisabled = item.required;
    return <label key={item.id} style={{
      ...labelBaseStyle,
      ...isChecked ? checkedStyle : {},
      ...isItemDisabled ? disabledStyle : {}
    }}>
                    <input type="checkbox" checked={isChecked} disabled={isItemDisabled} onChange={e => handleCheckboxChange(option.name, item.id, e.target.checked)} style={{
      display: 'none'
    }} />
                    {item.label}
                    {item.subtitle && <small style={{
      ...subtitleStyle,
      color: isChecked ? 'rgba(255,255,255,0.85)' : 'inherit'
    }}>{item.subtitle}</small>}
                  </label>;
  }) : option.items.map(item => {
    const isChecked = values[option.name] === item.id;
    return <label key={item.id} style={{
      ...labelBaseStyle,
      ...isChecked ? checkedStyle : {}
    }}>
                    <input type="radio" name={option.name} value={item.id} checked={isChecked} onChange={() => handleRadioChange(option.name, item.id)} style={{
      display: 'none'
    }} />
                    {item.label}
                    {item.subtitle && <small style={{
      ...subtitleStyle,
      color: isChecked ? 'rgba(255,255,255,0.85)' : 'inherit'
    }}>{item.subtitle}</small>}
                  </label>;
  })}
          </div>
        </div>)}
      <div style={cardStyle}>
        <div style={titleStyle}>Run this Command:</div>
        <pre style={commandDisplayStyle}>{generateCommand()}</pre>
      </div>
    </div>;
};

export const Ling26FlashDeployment = () => {
  const options = {
    hardware: {
      name: 'hardware',
      title: 'Hardware Platform',
      items: [{
        id: 'h20',
        label: 'H20-3e ×4',
        default: true
      }, {
        id: 'h100',
        label: 'H100 ×4',
        default: false
      }, {
        id: 'h200',
        label: 'H200 ×4',
        default: false
      }, {
        id: 'b200',
        label: 'B200 ×4',
        default: false
      }]
    },
    yarn: {
      name: 'yarn',
      title: 'Context Length',
      items: [{
        id: 'enabled',
        label: '256K (YaRN ×2)',
        default: true
      }, {
        id: 'disabled',
        label: '128K (default)',
        default: false
      }]
    },
    toolcall: {
      name: 'toolcall',
      title: 'Tool Call Parser',
      items: [{
        id: 'enabled',
        label: 'Enabled',
        default: true
      }, {
        id: 'disabled',
        label: 'Disabled',
        default: false
      }]
    },
    reasoning: {
      name: 'reasoning',
      title: 'Reasoning Parser',
      items: [{
        id: 'disabled',
        label: 'Disabled',
        default: true
      }, {
        id: 'enabled',
        label: 'qwen3 (split <think>)',
        default: false
      }]
    }
  };
  const getInitialState = () => {
    const initialState = {};
    Object.entries(options).forEach(([key, option]) => {
      if (option.type === 'checkbox') {
        initialState[key] = option.items.filter(item => item.default).map(item => item.id);
      } else {
        const defaultItem = option.items.find(item => item.default);
        initialState[key] = defaultItem ? defaultItem.id : option.items[0].id;
      }
    });
    return initialState;
  };
  const [values, setValues] = useState(getInitialState);
  const [isDark, setIsDark] = useState(false);
  useEffect(() => {
    const checkDarkMode = () => {
      const html = document.documentElement;
      const isDarkMode = html.classList.contains('dark') || html.getAttribute('data-theme') === 'dark' || html.style.colorScheme === 'dark';
      setIsDark(isDarkMode);
    };
    checkDarkMode();
    const observer = new MutationObserver(checkDarkMode);
    observer.observe(document.documentElement, {
      attributes: true,
      attributeFilter: ['class', 'data-theme', 'style']
    });
    return () => observer.disconnect();
  }, []);
  const handleRadioChange = (optionName, value) => {
    setValues(prev => ({
      ...prev,
      [optionName]: value
    }));
  };
  const handleCheckboxChange = (optionName, itemId, isChecked) => {
    setValues(prev => {
      const currentValues = prev[optionName] || [];
      if (isChecked) {
        return {
          ...prev,
          [optionName]: [...currentValues, itemId]
        };
      } else {
        return {
          ...prev,
          [optionName]: currentValues.filter(id => id !== itemId)
        };
      }
    });
  };
  const generateCommand = () => {
    const {yarn, toolcall, reasoning} = values;
    let cmd = `sglang serve \\\n`;
    cmd += `  --model-path inclusionAI/Ling-2.6-flash \\\n`;
    cmd += `  --tp-size 4 \\\n`;
    cmd += `  --trust-remote-code \\\n`;
    cmd += `  --host 0.0.0.0 \\\n`;
    cmd += `  --port \${PORT}`;
    if (yarn === 'enabled') {
      cmd += ` \\\n  --context-length 262144`;
      cmd += ` \\\n  --json-model-override-args '{"rope_scaling": {"rope_type": "yarn", "factor": 2.0, "rope_theta": 6000000, "partial_rotary_factor": 0.5, "original_max_position_embeddings": 131072}}'`;
    }
    if (toolcall === 'enabled') {
      cmd += ` \\\n  --tool-call-parser qwen25`;
    }
    if (reasoning === 'enabled') {
      cmd += ` \\\n  --reasoning-parser qwen3`;
    }
    return cmd;
  };
  const containerStyle = {
    maxWidth: '900px',
    margin: '0 auto',
    display: 'flex',
    flexDirection: 'column',
    gap: '4px'
  };
  const cardStyle = {
    padding: '8px 12px',
    border: `1px solid ${isDark ? '#374151' : '#e5e7eb'}`,
    borderLeft: `3px solid ${isDark ? '#E85D4D' : '#D45D44'}`,
    borderRadius: '4px',
    display: 'flex',
    alignItems: 'center',
    gap: '12px',
    background: isDark ? '#1f2937' : '#fff'
  };
  const titleStyle = {
    fontSize: '13px',
    fontWeight: '600',
    minWidth: '140px',
    flexShrink: 0,
    color: isDark ? '#e5e7eb' : 'inherit'
  };
  const itemsStyle = {
    display: 'flex',
    rowGap: '2px',
    columnGap: '6px',
    flexWrap: 'wrap',
    alignItems: 'center',
    flex: 1
  };
  const labelBaseStyle = {
    padding: '4px 10px',
    border: `1px solid ${isDark ? '#9ca3af' : '#d1d5db'}`,
    borderRadius: '3px',
    cursor: 'pointer',
    display: 'inline-flex',
    flexDirection: 'column',
    alignItems: 'center',
    justifyContent: 'center',
    fontWeight: '500',
    fontSize: '13px',
    transition: 'all 0.2s',
    userSelect: 'none',
    minWidth: '45px',
    textAlign: 'center',
    flex: 1,
    background: isDark ? '#374151' : '#fff',
    color: isDark ? '#e5e7eb' : 'inherit'
  };
  const checkedStyle = {
    background: '#D45D44',
    color: 'white',
    borderColor: '#D45D44'
  };
  const disabledStyle = {
    cursor: 'not-allowed',
    opacity: 0.5
  };
  const subtitleStyle = {
    display: 'block',
    fontSize: '9px',
    marginTop: '1px',
    lineHeight: '1.1',
    opacity: 0.7
  };
  const commandDisplayStyle = {
    flex: 1,
    padding: '12px 16px',
    background: isDark ? '#111827' : '#f5f5f5',
    borderRadius: '6px',
    fontFamily: "'Menlo', 'Monaco', 'Courier New', monospace",
    fontSize: '12px',
    lineHeight: '1.5',
    color: isDark ? '#e5e7eb' : '#374151',
    whiteSpace: 'pre-wrap',
    overflowX: 'auto',
    margin: 0,
    border: `1px solid ${isDark ? '#374151' : '#e5e7eb'}`
  };
  return <div style={containerStyle} className="not-prose">
      {Object.entries(options).map(([key, option]) => <div key={key} style={cardStyle}>
          <div style={titleStyle}>{option.title}</div>
          <div style={itemsStyle}>
            {option.type === 'checkbox' ? option.items.map(item => {
    const isChecked = (values[option.name] || []).includes(item.id);
    const isItemDisabled = item.required;
    return <label key={item.id} style={{
      ...labelBaseStyle,
      ...isChecked ? checkedStyle : {},
      ...isItemDisabled ? disabledStyle : {}
    }}>
                    <input type="checkbox" checked={isChecked} disabled={isItemDisabled} onChange={e => handleCheckboxChange(option.name, item.id, e.target.checked)} style={{
      display: 'none'
    }} />
                    {item.label}
                    {item.subtitle && <small style={{
      ...subtitleStyle,
      color: isChecked ? 'rgba(255,255,255,0.85)' : 'inherit'
    }}>{item.subtitle}</small>}
                  </label>;
  }) : option.items.map(item => {
    const isChecked = values[option.name] === item.id;
    return <label key={item.id} style={{
      ...labelBaseStyle,
      ...isChecked ? checkedStyle : {}
    }}>
                    <input type="radio" name={option.name} value={item.id} checked={isChecked} onChange={() => handleRadioChange(option.name, item.id)} style={{
      display: 'none'
    }} />
                    {item.label}
                    {item.subtitle && <small style={{
      ...subtitleStyle,
      color: isChecked ? 'rgba(255,255,255,0.85)' : 'inherit'
    }}>{item.subtitle}</small>}
                  </label>;
  })}
          </div>
        </div>)}
      <div style={cardStyle}>
        <div style={titleStyle}>Run this Command:</div>
        <pre style={commandDisplayStyle}>{generateCommand()}</pre>
      </div>
    </div>;
};

## 1. Model Introduction

The **Ling-2.6** family from inclusionAI is the next iteration of the Ling instant-model series. Continuing the architectural direction set by Ling-2.5, Ling-2.6 doubles down on **inference efficiency**, **token efficiency**, and **agent performance** — staying competitive with frontier instant models while being faster, leaner, and better suited for production agent workloads.

**Key Features:**

* **Hybrid Linear Attention**: A `1:7 MLA + Lightning Linear` hybrid built on top of a highly sparse MoE backbone. Compared with same-class SOTA models, Ling-2.6-flash shows up to \~4× higher prefill and decode throughput in long-context scenarios; Ling-2.6-1T is shipped in FP8 so it fits a single GB300 node with `--tp 4`.
* **Token Efficiency**: Trained with explicit token-efficiency objectives. On the full Artificial Analysis suite, Ling-2.6-flash uses only \~15M output tokens while remaining competitive — a meaningfully stronger intelligence-per-token profile than long-reasoning peers.
* **Agentic Capabilities**: Refined for tool use, multi-step planning, and long-horizon execution. Reaches SOTA-class results on **BFCL-V4**, **TAU2-bench**, **SWE-bench Verified**, **Claw-Eval**, and **PinchBench**, and is validated against Claude Code, Kilo Code, Qwen Code, Hermes Agent, and OpenClaw.
* **Long Context**: Native 128K, extendable to **256K (Ling-2.6-flash)** and **256K → 1M (Ling-2.6-1T via YaRN)**.

**Available Models:**

* **BF16**: [inclusionAI/Ling-2.6-flash](https://huggingface.co/inclusionAI/Ling-2.6-flash) — 104B total / 7.4B active
* **FP8 (E4M3)**: [inclusionAI/Ling-2.6-1T](https://huggingface.co/inclusionAI/Ling-2.6-1T) — \~1T total

**License:** MIT

## 2. SGLang Installation

SGLang offers multiple installation methods. You can choose the most suitable installation method based on your hardware platform and requirements.

Please refer to the [official SGLang installation guide](../../../docs/get-started/install) for installation instructions.

## 3. Model Deployment

### 3.1 Ling-2.6-flash

Ling-2.6-flash is a 104B/7.4B-active MoE that runs comfortably on a single 4-GPU node. Use the selector below to generate the launch command for your hardware.

<Ling26FlashDeployment />

#### Configuration Tips

* `--trust-remote-code` is required (custom `BailingMoeV2_5ForCausalLM` modeling code).
* `--tp-size 4` is the reference layout. On 4× H20-3e the model reaches \~340 tokens/s decode at TP=4, batch 32.
* Native context is 128K. Enable YaRN (`--json-model-override-args '{"rope_scaling": {"rope_type": "yarn", "factor": 2.0, ...}}'`) to extend to 256K — the snippet does this for you.
* `--tool-call-parser qwen25` matches the model's `<tool_call>...</tool_call>` schema.
* The recommended baseline does **not** include `--reasoning-parser qwen3`. Ling-2.6 is a controllable-reasoning model whose chat template defaults to `detailed thinking off`; the SGLang `qwen3` reasoning parser, in contrast, assumes default-thinking semantics and would mis-route normal output into `reasoning_content`. Only enable it if you specifically want `<think>...</think>` blocks split out — see [§4.3 Thinking Mode](#4-3-thinking-mode).
* **MTP (multi-token prediction)** is supported. Add `--speculative-algorithm NEXTN --speculative-num-steps 3 --speculative-eagle-topk 1 --speculative-num-draft-tokens 4 --mamba-scheduler-strategy extra_buffer` to enable it — see the [model card](https://huggingface.co/inclusionAI/Ling-2.6-flash#run-inference) for the full example.

### 3.2 Ling-2.6-1T

Ling-2.6-1T ships in **FP8 (E4M3)**, so unlike Ling-2.5-1T it fits a **single GB300 node with `--tp 4`**. On smaller GPUs (H200/B200), a 2-node deployment with `--pp-size 2` is required.

<Ling261TDeployment />

#### Configuration Tips

* `--trust-remote-code` is required for the custom modeling code.
* `--model-loader-extra-config '{"enable_multithread_load":"true","num_threads":64}'` significantly speeds up the multi-shard FP8 weight load (26 safetensors shards + an MTP layer).
* Use `--tool-call-parser qwen` for tool calling.
* The recommended baseline does **not** include `--reasoning-parser qwen3`. Ling-2.6's chat template defaults to `detailed thinking off`, while SGLang's `qwen3` reasoning parser assumes default-thinking semantics — combining the two requires a per-request workaround for tool calls (see [§4.3 Thinking Mode](#4-3-thinking-mode)). Only enable `--reasoning-parser qwen3` if you specifically want `<think>...</think>` blocks split into `reasoning_content`.
* For 2-node deployments, set `MASTER_IP`, `PORT`, and `DIST_PORT` consistently across both nodes.

## 4. Model Invocation

For example, launch a Ling-2.6-1T server on a single GB300 node:

```bash Command theme={null}
sglang serve \
  --model-path inclusionAI/Ling-2.6-1T \
  --tp-size 4 \
  --trust-remote-code \
  --host 0.0.0.0 \
  --port 30000 \
  --tool-call-parser qwen \
  --model-loader-extra-config '{"enable_multithread_load":"true","num_threads":64}'
```

### 4.1 Basic Usage

```bash Command theme={null}
curl -s http://${MASTER_IP}:${PORT}/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "auto", "messages": [{"role": "user", "content": "What is the capital of France?"}]}'
```

Output:

```json Config theme={null}
{
  "id": "...",
  "object": "chat.completion",
  "model": "auto",
  "choices": [
    {
      "index": 0,
      "message": {
        "role": "assistant",
        "content": "The capital of France is **Paris**.",
        "reasoning_content": null,
        "tool_calls": null
      },
      "finish_reason": "stop"
    }
  ]
}
```

### 4.2 Tool Calling Example

```bash Command theme={null}
curl -s http://${MASTER_IP}:${PORT}/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "auto",
    "messages": [{"role": "user", "content": "Search for the latest news about AI"}],
    "tools": [{
      "type": "function",
      "function": {
        "name": "search",
        "description": "Search for information on the internet",
        "parameters": {
          "type": "object",
          "properties": {
            "query": {"type": "string", "description": "The search query"}
          },
          "required": ["query"]
        }
      }
    }],
    "tool_choice": "auto"
  }'
```

Output:

```json Config theme={null}
{
  "choices": [
    {
      "message": {
        "role": "assistant",
        "content": null,
        "tool_calls": [
          {
            "id": "call_...",
            "type": "function",
            "function": {
              "name": "search",
              "arguments": "{\"query\": \"latest news about AI\"}"
            }
          }
        ]
      },
      "finish_reason": "tool_calls"
    }
  ]
}
```

### 4.3 Thinking Mode

Both Ling-2.6-flash and Ling-2.6-1T are **controllable-reasoning** models. Their chat template uses textual directives in the system message — `detailed thinking on` or `detailed thinking off` — to toggle thinking. The template **defaults to `detailed thinking off`** when neither phrase is present, and it does **not** read the Qwen3-style `enable_thinking` template variable.

#### Enabling thinking

Include `detailed thinking on` in the first system message:

```bash Command theme={null}
curl -s http://${MASTER_IP}:${PORT}/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "auto",
    "messages": [
      {"role": "system", "content": "detailed thinking on"},
      {"role": "user", "content": "If a box has 12 red balls and 8 blue balls, then 5 red balls are removed, how many balls remain?"}
    ]
  }'
```

If you already have a system prompt, append the directive on its own line:

```json theme={null}
{"role": "system", "content": "You are a helpful assistant.\ndetailed thinking on"}
```

When thinking is on, the model emits `<think>...</think>` blocks before its final answer. To get those split into `message.reasoning_content` automatically, also launch the server with `--reasoning-parser qwen3`.

#### Caveat: `--reasoning-parser qwen3` + tool calling

The SGLang `qwen3` reasoning parser was written for Qwen3, where models are **default-thinking** and clients opt out via `chat_template_kwargs.enable_thinking=false`. Ling-2.6 is the opposite — default-non-thinking, with toggling done in the system message. As a result, when the server is launched with **both** `--tool-call-parser qwen` and `--reasoning-parser qwen3`, every tool-call request must include `chat_template_kwargs.enable_thinking=false`, otherwise the parser routes the `<tool_call>...</tool_call>` block into `reasoning_content` instead of `message.tool_calls`:

```bash Command theme={null}
curl -s http://${MASTER_IP}:${PORT}/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "auto",
    "messages": [{"role": "user", "content": "Search for the latest news about AI"}],
    "tools": [...],
    "tool_choice": "auto",
    "chat_template_kwargs": {"enable_thinking": false}
  }'
```

`enable_thinking` here is consumed by the SGLang reasoning parser, **not** by the chat template — Ling-2.6's template ignores it. For the simplest configuration, just omit `--reasoning-parser qwen3` and toggle thinking via the system message.

For more API examples, see the [SGLang Basic Usage Guide](../../../docs/basic_usage/send_request).

## 5. Benchmark

### GSM8K (Ling-2.6-1T, GB300 × 4)

Reference run on a single GB300 node with `--tp 4`:

```bash Command theme={null}
python3 benchmark/gsm8k/bench_sglang.py
```

```text Output theme={null}
Accuracy: 0.9621 (1269 / 1319)
```

For Ling-2.6-flash, see the official numbers on the [model card](https://huggingface.co/inclusionAI/Ling-2.6-flash) (BFCL-V4, TAU2-bench, SWE-bench Verified, Claw-Eval, PinchBench, Artificial Analysis).
