> ## Documentation Index
> Fetch the complete documentation index at: https://docs.sglang.io/llms.txt
> Use this file to discover all available pages before exploring further.

# GLM-5.1

export const GLM51Deployment = () => {
  // Selector definitions, one entry per configuration card (rendered in insertion order).
  // Every entry has a `name` (the key used in the `values` state) and a `title`, plus
  // either a static `items` list or a `getDynamicItems(values)` function that computes
  // the list from the current selections. An optional `condition(values)` returning
  // false hides the card.
  const options = {
    hardware: {
      name: 'hardware',
      title: 'Hardware Platform',
      items: [
        { id: 'h100', label: 'H100', default: false },
        { id: 'h200', label: 'H200', default: true },
        { id: 'b300', label: 'B300', default: false },
        { id: 'gb300', label: 'GB300', default: false },
        { id: 'mi300x', label: 'MI300X', default: false },
        { id: 'mi325x', label: 'MI325X', default: false },
        { id: 'mi355x', label: 'MI355X', default: false }
      ]
    },
    quantization: {
      name: 'quantization',
      title: 'Quantization',
      // Exactly one precision is selectable per hardware family:
      // BF16 on AMD, NVFP4 on B300/GB300, FP8 on the remaining platforms.
      getDynamicItems: values => {
        const hw = values.hardware;
        const isAMD = ['mi300x', 'mi325x', 'mi355x'].includes(hw);
        const supportsNVFP4 = ['b300', 'gb300'].includes(hw);
        const recommendedInstead = supportsNVFP4
          ? 'NVFP4 is recommended for this hardware'
          : 'FP8 is recommended for this hardware';
        return [{
          id: 'bf16',
          label: 'BF16',
          subtitle: 'Full Weights',
          default: isAMD,
          disabled: !isAMD,
          // The reason doubles as the label tooltip, so it must be empty while BF16 is
          // selectable (AMD); otherwise the enabled button would claim FP8 is recommended.
          disabledReason: isAMD ? '' : recommendedInstead
        }, {
          id: 'fp8',
          label: 'FP8',
          subtitle: 'High Throughput',
          default: !isAMD && !supportsNVFP4,
          disabled: isAMD || supportsNVFP4,
          disabledReason: isAMD ? 'FP8 not verified on AMD' : supportsNVFP4 ? 'NVFP4 is recommended for this hardware' : ''
        }, {
          id: 'nvfp4',
          label: 'NVFP4',
          subtitle: 'Blackwell FP4',
          default: supportsNVFP4,
          disabled: !supportsNVFP4,
          disabledReason: supportsNVFP4 ? '' : 'NVFP4 only on B300/GB300'
        }];
      }
    },
    reasoning: {
      name: 'reasoning',
      title: 'Reasoning Parser',
      items: [
        { id: 'disabled', label: 'Disabled', default: false },
        { id: 'enabled', label: 'Enabled', default: true }
      ]
    },
    toolcall: {
      name: 'toolcall',
      title: 'Tool Call Parser',
      items: [
        { id: 'disabled', label: 'Disabled', default: false },
        { id: 'enabled', label: 'Enabled', default: true }
      ]
    },
    dpattention: {
      name: 'dpattention',
      title: 'DP Attention',
      items: [
        { id: 'disabled', label: 'Disabled', subtitle: 'Low Latency', default: true },
        { id: 'enabled', label: 'Enabled', subtitle: 'High Throughput', default: false }
      ]
    },
    speculative: {
      name: 'speculative',
      title: 'Speculative Decoding',
      // The card is only shown for non-AMD hardware.
      condition: values => !['mi300x', 'mi325x', 'mi355x'].includes(values.hardware),
      items: [
        { id: 'disabled', label: 'Disabled', default: false },
        { id: 'enabled', label: 'Enabled', default: true }
      ]
    }
  };
  // Launch parameters keyed by hardware id, then by precision id.
  // `tp` is emitted as --tp (and as --dp when DP attention is enabled);
  // `mem` is emitted as --mem-fraction-static.
  const modelConfigs = {
    h100: { fp8: { tp: 16, mem: 0.85 } },
    h200: { fp8: { tp: 8, mem: 0.85 } },
    b300: {
      nvfp4: { tp: 8, mem: 0.80 },
      fp8: { tp: 8, mem: 0.9 },
      bf16: { tp: 16, mem: 0.9 }
    },
    gb300: {
      nvfp4: { tp: 4, mem: 0.80 },
      fp8: { tp: 4, mem: 0.9 }
    },
    mi300x: { bf16: { tp: 8, mem: 0.80 } },
    mi325x: { bf16: { tp: 8, mem: 0.80 } },
    mi355x: { bf16: { tp: 8, mem: 0.80 } }
  };
  // Returns the selectable items of an option: the result of its
  // `getDynamicItems(values)` when that function exists, else its static `items`.
  const resolveItems = (option, values) =>
    typeof option.getDynamicItems === 'function'
      ? option.getDynamicItems(values)
      : option.items;
  // Computes the starting selection for every option, walking `options` in
  // insertion order so dynamic item lists can read choices made for earlier keys.
  // Preference per option: an enabled default, then the first enabled item,
  // then simply the first item.
  const getInitialState = () =>
    Object.entries(options).reduce((state, [key, option]) => {
      const choices = resolveItems(option, state);
      const selected =
        choices.find(choice => choice.default && !choice.disabled) ||
        choices.find(choice => !choice.disabled) ||
        choices[0];
      state[key] = selected.id;
      return state;
    }, {});
  const [values, setValues] = useState(getInitialState);
  const [isDark, setIsDark] = useState(false);
  useEffect(() => {
    const checkDarkMode = () => {
      const html = document.documentElement;
      const isDarkMode = html.classList.contains('dark') || html.getAttribute('data-theme') === 'dark' || html.style.colorScheme === 'dark';
      setIsDark(isDarkMode);
    };
    checkDarkMode();
    const observer = new MutationObserver(checkDarkMode);
    observer.observe(document.documentElement, {
      attributes: true,
      attributeFilter: ['class', 'data-theme', 'style']
    });
    return () => observer.disconnect();
  }, []);
  useEffect(() => {
    setValues(prev => {
      const next = {
        ...prev
      };
      for (const [key, option] of Object.entries(options)) {
        if (typeof option.getDynamicItems !== 'function') continue;
        const items = option.getDynamicItems(next);
        const current = items.find(i => i.id === next[key]);
        if (!current || current.disabled) {
          const fallback = items.find(i => i.default && !i.disabled) || items.find(i => !i.disabled);
          if (fallback) next[key] = fallback.id;
        }
      }
      return next;
    });
  }, [values.hardware]);
  // Records the picked item id for one option, leaving every other selection as is.
  const handleRadioChange = (optionName, value) => {
    const withSelection = current => ({ ...current, [optionName]: value });
    setValues(withSelection);
  };
  // Builds the multi-line `sglang serve` command for the current selections.
  // The precision is derived from the hardware family (BF16 on AMD, NVFP4 on
  // B300/GB300, FP8 otherwise), matching the single choice the quantization card
  // leaves enabled. Returns the command text, or a `#`-prefixed notice when
  // `modelConfigs` has no entry for the hardware/precision pair.
  const generateCommand = () => {
    const { hardware } = values;
    const isAMD = ['mi300x', 'mi325x', 'mi355x'].includes(hardware);
    const recommendsNVFP4 = ['b300', 'gb300'].includes(hardware);
    const effectiveQuant = isAMD ? 'bf16' : recommendsNVFP4 ? 'nvfp4' : 'fp8';

    // Guard the outer lookup too, so an unknown hardware id yields the notice
    // instead of a TypeError.
    const hwConfig = (modelConfigs[hardware] || {})[effectiveQuant];
    if (!hwConfig) return '# Configuration not available for the selected hardware and quantization.';

    const suffix = effectiveQuant === 'fp8' ? '-FP8' : '';
    const modelName = effectiveQuant === 'nvfp4' ? 'nvidia/GLM-5.1-NVFP4' : `zai-org/GLM-5.1${suffix}`;
    const tpValue = hwConfig.tp;
    const memFraction = hwConfig.mem;

    // The speculative card is hidden on AMD, but its stored value keeps the
    // 'enabled' default, so AMD has to be excluded explicitly here.
    const enableSpec = !isAMD && values.speculative === 'enabled';

    // One entry per output line; lines are joined with a shell line continuation.
    const lines = [
      'sglang serve',
      `  --model-path ${modelName}`,
      `  --tp ${tpValue}`
    ];
    if (effectiveQuant === 'nvfp4') {
      lines.push('  --quantization modelopt_fp4', '  --trust-remote-code');
    }
    if (isAMD) {
      lines.push(
        '  --trust-remote-code',
        '  --dsa-prefill-backend tilelang',
        '  --dsa-decode-backend tilelang',
        '  --chunked-prefill-size 131072',
        '  --watchdog-timeout 1200'
      );
    }
    if (values.dpattention === 'enabled') {
      lines.push(`  --dp ${tpValue}`, '  --enable-dp-attention');
    }
    if (values.reasoning === 'enabled') lines.push('  --reasoning-parser glm45');
    if (values.toolcall === 'enabled') lines.push('  --tool-call-parser glm47');
    if (enableSpec) {
      lines.push(
        '  --speculative-algorithm EAGLE',
        '  --speculative-num-steps 3',
        '  --speculative-eagle-topk 1',
        '  --speculative-num-draft-tokens 4'
      );
    }
    lines.push(`  --mem-fraction-static ${memFraction}`);
    return lines.join(' \\\n');
  };
  const containerStyle = {
    maxWidth: '900px',
    margin: '0 auto',
    display: 'flex',
    flexDirection: 'column',
    gap: '4px'
  };
  const cardStyle = {
    padding: '8px 12px',
    border: `1px solid ${isDark ? '#374151' : '#e5e7eb'}`,
    borderLeft: `3px solid ${isDark ? '#E85D4D' : '#D45D44'}`,
    borderRadius: '4px',
    display: 'flex',
    alignItems: 'center',
    gap: '12px',
    background: isDark ? '#1f2937' : '#fff'
  };
  const titleStyle = {
    fontSize: '13px',
    fontWeight: '600',
    minWidth: '140px',
    flexShrink: 0,
    color: isDark ? '#e5e7eb' : 'inherit'
  };
  const itemsStyle = {
    display: 'flex',
    rowGap: '2px',
    columnGap: '6px',
    flexWrap: 'wrap',
    alignItems: 'center',
    flex: 1
  };
  const labelBaseStyle = {
    padding: '4px 10px',
    border: `1px solid ${isDark ? '#9ca3af' : '#d1d5db'}`,
    borderRadius: '3px',
    cursor: 'pointer',
    display: 'inline-flex',
    flexDirection: 'column',
    alignItems: 'center',
    justifyContent: 'center',
    fontWeight: '500',
    fontSize: '13px',
    transition: 'all 0.2s',
    userSelect: 'none',
    minWidth: '45px',
    textAlign: 'center',
    flex: 1,
    background: isDark ? '#374151' : '#fff',
    color: isDark ? '#e5e7eb' : 'inherit'
  };
  const checkedStyle = {
    background: '#D45D44',
    color: 'white',
    borderColor: '#D45D44'
  };
  const disabledStyle = {
    cursor: 'not-allowed',
    opacity: 0.4
  };
  const subtitleStyle = {
    display: 'block',
    fontSize: '9px',
    marginTop: '1px',
    lineHeight: '1.1',
    opacity: 0.7
  };
  const commandDisplayStyle = {
    flex: 1,
    padding: '12px 16px',
    background: isDark ? '#111827' : '#f5f5f5',
    borderRadius: '6px',
    fontFamily: "'Menlo', 'Monaco', 'Courier New', monospace",
    fontSize: '12px',
    lineHeight: '1.5',
    color: isDark ? '#e5e7eb' : '#374151',
    whiteSpace: 'pre-wrap',
    overflowX: 'auto',
    margin: 0,
    border: `1px solid ${isDark ? '#374151' : '#e5e7eb'}`
  };
  return <div style={containerStyle} className="not-prose">
      {Object.entries(options).map(([key, option]) => {
    if (typeof option.condition === 'function' && !option.condition(values)) return null;
    const items = resolveItems(option, values);
    return <div key={key} style={cardStyle}>
            <div style={titleStyle}>{option.title}</div>
            <div style={itemsStyle}>
              {items.map(item => {
      const isChecked = values[option.name] === item.id;
      const isDisabled = !!item.disabled;
      return <label key={item.id} style={{
        ...labelBaseStyle,
        ...isChecked ? checkedStyle : {},
        ...isDisabled ? disabledStyle : {}
      }} title={item.disabledReason || ''}>
                    <input type="radio" name={option.name} value={item.id} checked={isChecked} disabled={isDisabled} onChange={() => !isDisabled && handleRadioChange(option.name, item.id)} style={{
        display: 'none'
      }} />
                    {item.label}
                    {item.subtitle && <small style={{
        ...subtitleStyle,
        color: isChecked ? 'rgba(255,255,255,0.85)' : 'inherit'
      }}>{item.subtitle}</small>}
                  </label>;
    })}
            </div>
          </div>;
  })}
      <div style={cardStyle}>
        <div style={titleStyle}>Run this Command:</div>
        <pre style={commandDisplayStyle}>{generateCommand()}</pre>
      </div>
    </div>;
};

## 1. Model Introduction

**Available Models:**

* **BF16 (Full precision)**: [zai-org/GLM-5.1](https://huggingface.co/zai-org/GLM-5.1)
* **FP8 (8-bit quantized)**: [zai-org/GLM-5.1-FP8](https://huggingface.co/zai-org/GLM-5.1-FP8)
* **NVFP4 (4-bit quantized)**: [nvidia/GLM-5.1-NVFP4](https://huggingface.co/nvidia/GLM-5.1-NVFP4)

**License:** MIT

## 2. SGLang Installation

Please refer to the [official SGLang installation guide](../../../docs/get-started/install) for installation instructions.

## 3. Model Deployment

This section provides deployment configurations optimized for different hardware platforms and use cases.

### 3.1 Basic Configuration

**Interactive Command Generator**: Use the configuration selector below to automatically generate the appropriate deployment command for your hardware platform, quantization method, and capabilities. SGLang supports serving GLM-5.1 on NVIDIA H100, H200, B300, GB300, and AMD MI300X/MI325X/MI355X GPUs.

<GLM51Deployment />

### 3.2 Configuration Tips

* Speculative decoding (MTP) can significantly reduce latency for interactive use cases.
* **DP Attention**: Enables data parallel attention for higher throughput under high concurrency. Note that DP attention trades off low-concurrency latency for high-concurrency throughput — disable it if your workload is latency-sensitive with few concurrent requests.
* The `--mem-fraction-static` flag is recommended for optimal memory utilization; adjust it based on your hardware and workload.

<table style={{width: "100%", borderCollapse: "collapse", tableLayout: "fixed"}}>
  <thead>
    <tr style={{borderBottom: "2px solid #d55816"}}>
      <th style={{textAlign: "left", padding: "10px 12px", fontWeight: 700, whiteSpace: "nowrap", backgroundColor: "rgba(255,255,255,0.02)"}}>Hardware</th>
      <th style={{textAlign: "left", padding: "10px 12px", fontWeight: 700, whiteSpace: "nowrap", backgroundColor: "rgba(255,255,255,0.05)"}}>NVFP4</th>
      <th style={{textAlign: "left", padding: "10px 12px", fontWeight: 700, whiteSpace: "nowrap", backgroundColor: "rgba(255,255,255,0.02)"}}>FP8</th>
      <th style={{textAlign: "left", padding: "10px 12px", fontWeight: 700, whiteSpace: "nowrap", backgroundColor: "rgba(255,255,255,0.05)"}}>BF16</th>
    </tr>
  </thead>

  <tbody>
    <tr>
      <td style={{padding: "9px 12px", fontWeight: 500, backgroundColor: "rgba(255,255,255,0.02)"}}>H100</td>
      <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.05)"}}>—</td>
      <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.02)"}}>tp=16</td>
      <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.05)"}}>—</td>
    </tr>

    <tr>
      <td style={{padding: "9px 12px", fontWeight: 500, backgroundColor: "rgba(255,255,255,0.02)"}}>H200</td>
      <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.05)"}}>—</td>
      <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.02)"}}>tp=8</td>
      <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.05)"}}>—</td>
    </tr>

    <tr>
      <td style={{padding: "9px 12px", fontWeight: 500, backgroundColor: "rgba(255,255,255,0.02)"}}>B300</td>
      <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.05)"}}>tp=8</td>
      <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.02)"}}>—</td>
      <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.05)"}}>—</td>
    </tr>

    <tr>
      <td style={{padding: "9px 12px", fontWeight: 500, backgroundColor: "rgba(255,255,255,0.02)"}}>GB300</td>
      <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.05)"}}>tp=4</td>
      <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.02)"}}>—</td>
      <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.05)"}}>—</td>
    </tr>

    <tr>
      <td style={{padding: "9px 12px", fontWeight: 500, backgroundColor: "rgba(255,255,255,0.02)"}}>MI300X/MI325X</td>
      <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.05)"}}>—</td>
      <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.02)"}}>tp=8</td>
      <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.05)"}}>tp=8</td>
    </tr>

    <tr>
      <td style={{padding: "9px 12px", fontWeight: 500, backgroundColor: "rgba(255,255,255,0.02)"}}>MI355X</td>
      <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.05)"}}>—</td>
      <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.02)"}}>tp=8</td>
      <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.05)"}}>tp=8</td>
    </tr>
  </tbody>
</table>

* **H100 and H200**: FP8 is the recommended deployment path.
* **B300 and GB300**: NVFP4 is the recommended deployment path. Use `nvidia/GLM-5.1-NVFP4` with `--quantization modelopt_fp4`. Use `tp=8` on B300 and `tp=4` on GB300. The CUDA 13 image variant is required for B300 and GB300.
* **AMD GPUs**: Both BF16 and FP8 checkpoints are supported on MI300X/MI325X/MI355X at tp=8. Use `--dsa-prefill-backend tilelang --dsa-decode-backend tilelang` for the DSA attention backend. Add `--chunked-prefill-size 131072` and `--watchdog-timeout 1200` (20 minutes for weight loading). FP8 uses approximately half the memory of BF16 (\~89 GB/GPU vs \~175 GB/GPU). EAGLE speculative decoding is not currently supported on AMD for GLM-5.1.
* For other configuration tips (MTP, DSA kernel, Context Parallel, HiSparse, NVFP4, Index Cache), see the [DeepSeek-V3.2 cookbook page](../DeepSeek/DeepSeek-V3_2). GLM-5.1 and DeepSeek-V3.2 share the same model structure, so the same optimization techniques apply to both.
* Use `--json-model-override-args '{"index_topk_pattern": "FFSFSSSFSSFFFSSSFFFSFSSSSSSFFSFFSFFSSFFFFFFSFFFFFSFFSSSSSSFSFFFSFSSSFSFFSFFSSS"}'` to enable the [IndexCache](https://github.com/THUDM/IndexCache) method for GLM-5.1. This can improve serving efficiency with only a small accuracy loss. If you are running rigorous accuracy evaluations, do not enable this feature.

## 4. Model Invocation

Deploy GLM-5.1 with the following command (FP8 on H200, all features enabled):

```shell Command theme={null}
sglang serve \
  --model-path zai-org/GLM-5.1-FP8 \
  --tp 8 \
  --tool-call-parser glm47 \
  --reasoning-parser glm45 \
  --speculative-algorithm EAGLE \
  --speculative-num-steps 3 \
  --speculative-eagle-topk 1 \
  --speculative-num-draft-tokens 4 \
  --mem-fraction-static 0.85 \
  --host 0.0.0.0 \
  --port 30000
```

### 4.1 B300/GB300 (NVFP4) Server Command

#### B300

```shell Command theme={null}
sglang serve \
  --model-path nvidia/GLM-5.1-NVFP4 \
  --tp 8 \
  --quantization modelopt_fp4 \
  --tool-call-parser glm47 \
  --reasoning-parser glm45 \
  --speculative-algorithm EAGLE \
  --speculative-num-steps 3 \
  --speculative-eagle-topk 1 \
  --speculative-num-draft-tokens 4 \
  --trust-remote-code \
  --mem-fraction-static 0.80 \
  --host 0.0.0.0 \
  --port 30000
```

#### GB300

```shell Command theme={null}
sglang serve \
  --model-path nvidia/GLM-5.1-NVFP4 \
  --tp 4 \
  --quantization modelopt_fp4 \
  --tool-call-parser glm47 \
  --reasoning-parser glm45 \
  --speculative-algorithm EAGLE \
  --speculative-num-steps 3 \
  --speculative-eagle-topk 1 \
  --speculative-num-draft-tokens 4 \
  --trust-remote-code \
  --mem-fraction-static 0.80 \
  --host 0.0.0.0 \
  --port 30000
```

### 4.2 MI300X/MI325X/MI355X (ROCm) Server Command

The following ROCm commands are additional options for AMD GPUs and do not replace the NVIDIA instructions above.

#### FP8 (Recommended)

```shell Command theme={null}
sglang serve \
  --model-path zai-org/GLM-5.1-FP8 \
  --tp 8 \
  --trust-remote-code \
  --tool-call-parser glm47 \
  --reasoning-parser glm45 \
  --dsa-prefill-backend tilelang \
  --dsa-decode-backend tilelang \
  --chunked-prefill-size 131072 \
  --mem-fraction-static 0.80 \
  --watchdog-timeout 1200 \
  --host 0.0.0.0 \
  --port 30000
```

#### BF16

```shell Command theme={null}
sglang serve \
  --model-path zai-org/GLM-5.1 \
  --tp 8 \
  --trust-remote-code \
  --dsa-prefill-backend tilelang \
  --dsa-decode-backend tilelang \
  --chunked-prefill-size 131072 \
  --mem-fraction-static 0.80 \
  --watchdog-timeout 1200 \
  --host 0.0.0.0 \
  --port 30000
```

### 4.3 Basic Usage

For basic API usage and request examples, please refer to:

* [SGLang Basic Usage Guide](../../../docs/basic_usage/send_request)

### 4.4 Advanced Usage

#### 4.4.1 Reasoning Parser

GLM-5.1 supports Thinking mode **by default**. Enable the reasoning parser during deployment to separate the thinking and content sections. The thinking process is returned via `reasoning_content` in the streaming response.

To disable thinking and use Instruct mode, pass `chat_template_kwargs` at request time:

* **Thinking mode** (default): The model performs step-by-step reasoning before answering. No extra parameters needed.
* **Instruct mode** (`{"enable_thinking": false}`): The model responds directly without a thinking process.

**Example 1: Thinking Mode (Default)**

Thinking mode is enabled by default. The model will reason step-by-step before answering, and the thinking process is returned via `reasoning_content`:

```python Example theme={null}
from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:30000/v1",
    api_key="EMPTY"
)

# Thinking mode is enabled by default, no extra parameters needed
response = client.chat.completions.create(
    model="zai-org/GLM-5.1-FP8",
    messages=[
        {"role": "user", "content": "Solve this problem step by step: What is 15% of 240?"}
    ],
    max_tokens=2048,
    stream=True
)

# Process the stream
has_thinking = False
has_answer = False
thinking_started = False

for chunk in response:
    if chunk.choices and len(chunk.choices) > 0:
        delta = chunk.choices[0].delta

        # Print thinking process
        if hasattr(delta, 'reasoning_content') and delta.reasoning_content:
            if not thinking_started:
                print("=============== Thinking =================", flush=True)
                thinking_started = True
            has_thinking = True
            print(delta.reasoning_content, end="", flush=True)

        # Print answer content
        if delta.content:
            # Close thinking section and add content header
            if has_thinking and not has_answer:
                print("\n=============== Content =================", flush=True)
                has_answer = True
            print(delta.content, end="", flush=True)

print()
```

**Output Example:**

```text Output theme={null}
=============== Thinking =================
1.  **Understand the Goal:** The user wants to find 15% of 240, and they want the solution explained step-by-step.

2.  **Identify the Core Mathematical Concept:** "Percent" means "per hundred" or "out of 100". Finding "X% of Y" translates to the mathematical operation: $(X / 100) \times Y$.

3.  **Step-by-Step Breakdown:**
    *   *Step 1: Convert the percentage to a decimal (or fraction).* 15% means 15 out of 100, which is $15/100$ or $0.15$.
    *   *Step 2: Multiply the decimal by the given number.* Multiply $0.15$ by $240$.
    *   *Step 3: Perform the calculation.*
        *   $0.15 \times 240$
        *   I can break this down further to make it easy to follow:
            *   $0.10 \times 240 = 24$ (which is 10%)
            *   $0.05 \times 240 = 12$ (which is 5%, half of 10%)
            *   $24 + 12 = 36$
        *   Alternatively, standard multiplication:
            *   $240 \times 15 = 3600$
            *   Move decimal two places left -> $36$
    *   *Step 4: State the final answer clearly.*

4.  **Draft the Response (incorporating the steps clearly):**
    *   *Introduction:* State the problem clearly.
    *   *Step 1:* Explain how to convert 15% to a decimal.
    *   *Step 2:* Explain the multiplication step.
    *   *Step 3:* Show the actual math (I'll provide the standard multiplication and the "mental math" trick as it adds value).
    *   *Conclusion:* Give the final answer.

5.  **Refine the Output (Self-Correction/Polishing during drafting):**
    *   *Drafting Step 1:* To find 15% of 240, first convert 15% into a decimal. Since percent means "per hundred," you divide 15 by 100. 15 ÷ 100 = 0.15.
    *   *Drafting Step 2:* Next, multiply this decimal by the number you are finding the percentage of (which is 240). So, calculate 0.15 × 240.
    *   *Drafting Step 3 (Standard way):* 0.15 × 240 = 36.
    *   *Adding the alternative mental math way:* It's often helpful to break it down into 10% and 5%.
        *   10% of 240 = 24 (move the decimal point one place to the left)
        *   5% is half of 10%, so half of 24 = 12
        *   Add them together: 24 + 12 = 36.
    *   *Final Answer:* 15% of 240 is 36.

6.  **Final Review against User Prompt:** Does it solve the problem? Yes. Is it step-by-step? Yes. Is it clear? Yes. (Proceed to generate output).
=============== Content =================
Here is the step-by-step solution to find 15% of 240:

**Step 1: Convert the percentage to a decimal.**
To convert a percentage to a decimal, divide it by 100 (or simply move the decimal point two places to the left).
* 15% = 15 ÷ 100 = **0.15**

**Step 2: Multiply the decimal by the number.**
Now, multiply the decimal (0.15) by the number you are finding the percentage of (240).
* 0.15 × 240 = **36**

*(Alternative mental math method for Step 2)*:
If you don't want to multiply by 0.15 directly, you can break 15% down into 10% and 5%:
* **10% of 240** = 24 (just move the decimal point one place to the left)
* **5% of 240** = 12 (5% is half of 10%, so just divide 24 by 2)
* **Add them together**: 24 + 12 = **36**

**Answer:**
15% of 240 is **36**.
```

**Example 2: Instruct Mode (Thinking Off)**

To disable thinking and get a direct response, pass `{"enable_thinking": false}` via `chat_template_kwargs`:

```python Example theme={null}
from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:30000/v1",
    api_key="EMPTY"
)

# Disable thinking mode via chat_template_kwargs
response = client.chat.completions.create(
    model="zai-org/GLM-5.1-FP8",
    messages=[
        {"role": "user", "content": "What is 15% of 240?"}
    ],
    extra_body={"chat_template_kwargs": {"enable_thinking": False}},
    max_tokens=2048,
    stream=True
)

# In Instruct mode, the model responds directly without reasoning_content
for chunk in response:
    if chunk.choices and len(chunk.choices) > 0:
        delta = chunk.choices[0].delta
        if delta.content:
            print(delta.content, end="", flush=True)

print()
```

**Output Example:**

```text Output theme={null}
15% of 240 is 36.

Here is how to calculate it:
1. Convert the percentage to a decimal: 15% = 0.15
2. Multiply the decimal by the number: 0.15 × 240 = 36
```

#### 4.4.2 Tool Calling

GLM-5.1 supports tool calling capabilities. Enable the tool call parser during deployment. Thinking mode is on by default; to disable it for tool calling requests, pass `extra_body={"chat_template_kwargs": {"enable_thinking": False}}`.

**Python Example (with Thinking Process):**

```python Example theme={null}
from openai import OpenAI

# Client for the local server's OpenAI-compatible endpoint.
client = OpenAI(base_url="http://localhost:30000/v1", api_key="EMPTY")

# Tools offered to the model: a single get_weather function that requires a
# city name and optionally accepts a temperature unit.
get_weather_parameters = {
    "type": "object",
    "properties": {
        "location": {"type": "string", "description": "The city name"},
        "unit": {
            "type": "string",
            "enum": ["celsius", "fahrenheit"],
            "description": "Temperature unit",
        },
    },
    "required": ["location"],
}
tools = [
    {
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Get the current weather for a location",
            "parameters": get_weather_parameters,
        },
    }
]

# Stream the response so the thinking process is visible as it is generated.
stream = client.chat.completions.create(
    model="zai-org/GLM-5.1-FP8",
    messages=[{"role": "user", "content": "What's the weather in Beijing?"}],
    tools=tools,
    stream=True,
)

# True while the "Thinking" section is open, i.e. its header has been printed
# and the "Content" separator has not been printed yet.
thinking_open = False

for chunk in stream:
    if not chunk.choices:
        continue  # nothing to print for a chunk without choices
    delta = chunk.choices[0].delta

    # Reasoning text: print the header once, then stream the fragments.
    reasoning = getattr(delta, "reasoning_content", None)
    if reasoning:
        if not thinking_open:
            print("=============== Thinking =================", flush=True)
            thinking_open = True
        print(reasoning, end="", flush=True)

    # Tool-call deltas: close the thinking section first if it is still open,
    # then print every delta exactly as it arrives.
    tool_calls = getattr(delta, "tool_calls", None)
    if tool_calls:
        if thinking_open:
            print("\n=============== Content =================", flush=True)
            thinking_open = False

        for tool_call in tool_calls:
            fn = tool_call.function
            if fn:
                print(f"Tool Call: {fn.name}")
                print(f"   Arguments: {fn.arguments}")

    # Regular answer text, if any.
    if delta.content:
        print(delta.content, end="", flush=True)

print()  # finish the streamed output with a newline
```

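**Output Example:**

Because the response is streamed, each tool call arrives as a series of deltas: the function name is present only in the first delta, and the following deltas carry fragments of the JSON arguments (which is why the name prints as `None`). Concatenate the `arguments` fragments to reconstruct the full arguments string, here `{"location": "Beijing"}`.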

```text Output theme={null}
=============== Thinking =================
The user wants to know the weather in Beijing. I'll call the get_weather function with "Beijing" as the location.
=============== Content =================
Tool Call: get_weather
   Arguments:
Tool Call: None
   Arguments: {
Tool Call: None
   Arguments: "location": "Be
Tool Call: None
   Arguments: ijing"
Tool Call: None
   Arguments: }
```

## 5. Benchmark

### 5.1 Speed Benchmark

**Test Environment:**

* Hardware: H200 (8x)
* Model: GLM-5.1-FP8
* Tensor Parallelism: 8
* SGLang Version: commit 947927bdb

#### 5.1.1 Latency Benchmark

```bash Command theme={null}
python3 -m sglang.bench_serving \
  --backend sglang \
  --model zai-org/GLM-5.1-FP8 \
  --dataset-name random \
  --random-input-len 1000 \
  --random-output-len 1000 \
  --num-prompts 10 \
  --max-concurrency 1 \
  --request-rate inf
```

```text Output theme={null}
============ Serving Benchmark Result ============
Backend:                                 sglang
Traffic request rate:                    inf
Max request concurrency:                 1
Successful requests:                     10
Benchmark duration (s):                  35.78
Total input tokens:                      6101
Total input text tokens:                 6101
Total generated tokens:                  4220
Total generated tokens (retokenized):    4213
Request throughput (req/s):              0.28
Input token throughput (tok/s):          170.54
Output token throughput (tok/s):         117.96
Peak output token throughput (tok/s):    148.00
Peak concurrent requests:                2
Total token throughput (tok/s):          288.50
Concurrency:                             1.00
Accept length:                           3.48
----------------End-to-End Latency----------------
Mean E2E Latency (ms):                   3576.31
Median E2E Latency (ms):                 2935.97
P90 E2E Latency (ms):                    5908.97
P99 E2E Latency (ms):                    8588.08
---------------Time to First Token----------------
Mean TTFT (ms):                          290.88
Median TTFT (ms):                        282.34
P99 TTFT (ms):                           332.27
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms):                          7.54
Median TPOT (ms):                        6.97
P99 TPOT (ms):                           9.04
---------------Inter-Token Latency----------------
Mean ITL (ms):                           7.80
Median ITL (ms):                         6.81
P95 ITL (ms):                            13.51
P99 ITL (ms):                            26.99
Max ITL (ms):                            29.50
==================================================
```

#### 5.1.2 Throughput Benchmark

```bash Command theme={null}
python3 -m sglang.bench_serving \
  --backend sglang \
  --model zai-org/GLM-5.1-FP8 \
  --dataset-name random \
  --random-input-len 1000 \
  --random-output-len 1000 \
  --num-prompts 1000 \
  --max-concurrency 100 \
  --request-rate inf
```

```text Output theme={null}
============ Serving Benchmark Result ============
Backend:                                 sglang
Traffic request rate:                    inf
Max request concurrency:                 100
Successful requests:                     1000
Benchmark duration (s):                  411.74
Total input tokens:                      502493
Total input text tokens:                 502493
Total generated tokens:                  500251
Total generated tokens (retokenized):    499614
Request throughput (req/s):              2.43
Input token throughput (tok/s):          1220.41
Output token throughput (tok/s):         1214.97
Peak output token throughput (tok/s):    2648.00
Peak concurrent requests:                105
Total token throughput (tok/s):          2435.38
Concurrency:                             96.30
Accept length:                           3.50
----------------End-to-End Latency----------------
Mean E2E Latency (ms):                   39648.76
Median E2E Latency (ms):                 39058.12
P90 E2E Latency (ms):                    57009.82
P99 E2E Latency (ms):                    68880.33
---------------Time to First Token----------------
Mean TTFT (ms):                          20613.80
Median TTFT (ms):                        21429.21
P99 TTFT (ms):                           29543.17
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms):                          38.73
Median TPOT (ms):                        36.52
P99 TPOT (ms):                           67.09
---------------Inter-Token Latency----------------
Mean ITL (ms):                           38.13
Median ITL (ms):                         16.57
P95 ITL (ms):                            86.01
P99 ITL (ms):                            164.88
Max ITL (ms):                            1307.02
==================================================
```

### 5.2 Accuracy Benchmark

<Note>
  The accuracy benchmark results below are shared with GLM-5, as GLM-5.1 was not independently benchmarked at the time of this writing. A separate benchmark run is planned.
</Note>

#### 5.2.1 GSM8K Benchmark

* Benchmark Command

```bash Command theme={null}
python3 benchmark/gsm8k/bench_sglang.py --port 30000
```

* Test Result

```text Output theme={null}
Accuracy: 0.955
Invalid: 0.000
Latency: 32.470 s
Output throughput: 642.044 token/s
```

#### 5.2.2 MMLU Benchmark

* Benchmark Command

```bash Command theme={null}
python3 benchmark/mmlu/bench_sglang.py --port 30000
```

* Test Result

```text Output theme={null}
subject: abstract_algebra, #q:100, acc: 0.860
subject: anatomy, #q:135, acc: 0.874
subject: astronomy, #q:152, acc: 0.941
subject: business_ethics, #q:100, acc: 0.880
subject: clinical_knowledge, #q:265, acc: 0.932
subject: college_biology, #q:144, acc: 0.972
subject: college_chemistry, #q:100, acc: 0.640
subject: college_computer_science, #q:100, acc: 0.900
subject: college_mathematics, #q:100, acc: 0.810
subject: college_medicine, #q:173, acc: 0.873
subject: college_physics, #q:102, acc: 0.912
subject: computer_security, #q:100, acc: 0.880
subject: conceptual_physics, #q:235, acc: 0.928
subject: econometrics, #q:114, acc: 0.807
subject: electrical_engineering, #q:145, acc: 0.897
subject: elementary_mathematics, #q:378, acc: 0.937
subject: formal_logic, #q:126, acc: 0.778
subject: global_facts, #q:100, acc: 0.710
subject: high_school_biology, #q:310, acc: 0.961
subject: high_school_chemistry, #q:203, acc: 0.847
subject: high_school_computer_science, #q:100, acc: 0.960
subject: high_school_european_history, #q:165, acc: 0.891
subject: high_school_geography, #q:198, acc: 0.960
subject: high_school_government_and_politics, #q:193, acc: 0.984
subject: high_school_macroeconomics, #q:390, acc: 0.923
subject: high_school_mathematics, #q:270, acc: 0.696
subject: high_school_microeconomics, #q:238, acc: 0.962
subject: high_school_physics, #q:151, acc: 0.821
subject: high_school_psychology, #q:545, acc: 0.956
subject: high_school_statistics, #q:216, acc: 0.889
subject: high_school_us_history, #q:204, acc: 0.941
subject: high_school_world_history, #q:237, acc: 0.945
subject: human_aging, #q:223, acc: 0.857
subject: human_sexuality, #q:131, acc: 0.908
subject: international_law, #q:121, acc: 0.934
subject: jurisprudence, #q:108, acc: 0.907
subject: logical_fallacies, #q:163, acc: 0.933
subject: machine_learning, #q:112, acc: 0.830
subject: management, #q:103, acc: 0.942
subject: marketing, #q:234, acc: 0.940
subject: medical_genetics, #q:100, acc: 0.990
subject: miscellaneous, #q:783, acc: 0.959
subject: moral_disputes, #q:346, acc: 0.873
subject: moral_scenarios, #q:895, acc: 0.837
subject: nutrition, #q:306, acc: 0.922
subject: philosophy, #q:311, acc: 0.897
subject: prehistory, #q:324, acc: 0.929
subject: professional_accounting, #q:282, acc: 0.844
subject: professional_law, #q:1534, acc: 0.714
subject: professional_medicine, #q:272, acc: 0.941
subject: professional_psychology, #q:612, acc: 0.913
subject: public_relations, #q:110, acc: 0.791
subject: security_studies, #q:245, acc: 0.878
subject: sociology, #q:201, acc: 0.940
subject: us_foreign_policy, #q:100, acc: 0.920
subject: virology, #q:166, acc: 0.596
subject: world_religions, #q:171, acc: 0.936
Total latency: 165.275
Average accuracy: 0.877
```

### 5.3 AMD GPU Benchmarks

#### 5.3.1 GSM8K Benchmark (MI325/MI35x)

* MI325/MI35x Test (GLM-5.1 BF16, `tp=8`, TileLang DSA backends)

```bash Command theme={null}
python3 benchmark/gsm8k/bench_sglang.py --num-questions 200
```

```text Output theme={null}
Accuracy: 0.970
Invalid: 0.000
```

Results from [AMD nightly CI](https://github.com/sgl-project/sglang/actions/runs/22556197510/attempts/2#summary-65346783629). See also [sglang#18911](https://github.com/sgl-project/sglang/pull/18911).