> ## Documentation Index
> Fetch the complete documentation index at: https://docs.sglang.io/llms.txt
> Use this file to discover all available pages before exploring further.

# GLM Glyph

export const GLMGlyphDeployment = () => {
  const modelFamily = 'zai-org';
  const options = {
    hardware: {
      name: 'hardware',
      title: 'Hardware Platform',
      items: [{
        id: 'b200',
        label: 'B200',
        default: true
      }, {
        id: 'h100',
        label: 'H100',
        default: false
      }, {
        id: 'h200',
        label: 'H200',
        default: false
      }, {
        id: 'mi300x',
        label: 'MI300X',
        default: false
      }, {
        id: 'mi325x',
        label: 'MI325X',
        default: false
      }, {
        id: 'mi355x',
        label: 'MI355X',
        default: false
      }]
    },
    quantization: {
      name: 'quantization',
      title: 'Quantization',
      items: [{
        id: 'bf16',
        label: 'BF16',
        default: true
      }, {
        id: 'fp8',
        label: 'FP8',
        default: false
      }]
    },
    reasoning: {
      name: 'reasoning',
      title: 'Reasoning Parser',
      items: [{
        id: 'enabled',
        label: 'Enabled',
        default: true
      }, {
        id: 'disabled',
        label: 'Disabled',
        default: false
      }],
      commandRule: value => value === 'enabled' ? '--reasoning-parser glm45' : null
    },
    toolcall: {
      name: 'toolcall',
      title: 'Tool Call Parser',
      items: [{
        id: 'enabled',
        label: 'Enabled',
        default: true
      }, {
        id: 'disabled',
        label: 'Disabled',
        default: false
      }],
      commandRule: value => value === 'enabled' ? '--tool-call-parser glm45' : null
    }
  };
  const modelConfig = {
    baseName: 'Glyph',
    b200: {
      tp: 4,
      bf16: true,
      fp8: true
    },
    h100: {
      tp: 4,
      bf16: true,
      fp8: true
    },
    h200: {
      tp: 4,
      bf16: true,
      fp8: true
    },
    mi300x: {
      tp: 4,
      bf16: true,
      fp8: true
    },
    mi325x: {
      tp: 4,
      bf16: true,
      fp8: true
    },
    mi355x: {
      tp: 2,
      bf16: true,
      fp8: true
    }
  };
  const generateCommand = values => {
    const {hardware, quantization} = values;
    const hwConfig = modelConfig[hardware];
    if (!hwConfig) {
      return `# Error: Unknown hardware platform: ${hardware}`;
    }
    const quantSuffix = quantization === 'fp8' ? '-FP8' : '';
    const modelName = `${modelFamily}/${modelConfig.baseName}${quantSuffix}`;
    const isAMD = ['mi300x', 'mi325x', 'mi355x'].includes(hardware);
    let cmd = '';
    if (isAMD) {
      cmd = 'python3 -m sglang.launch_server \\\n';
      cmd += `  --model-path ${modelName}`;
      cmd += ` \\\n  --tp ${hwConfig.tp}`;
    } else {
      cmd = 'python -m sglang.launch_server \\\n';
      cmd += `  --model ${modelName}`;
      if (hwConfig.tp > 1) {
        cmd += ` \\\n  --tp ${hwConfig.tp}`;
      }
    }
    for (const [key, option] of Object.entries(options)) {
      if (key === 'hardware' || key === 'quantization') continue;
      if (option.commandRule) {
        const rule = option.commandRule(values[key]);
        if (rule) {
          cmd += ` \\\n  ${rule}`;
        }
      }
    }
    return cmd;
  };
  const getInitialState = () => {
    const initialState = {};
    Object.entries(options).forEach(([key, option]) => {
      if (option.type === 'checkbox') {
        initialState[key] = (option.items || []).filter(item => item.default).map(item => item.id);
        return;
      }
      if (option.type === 'text') {
        initialState[key] = option.default || '';
        return;
      }
      let items = option.items || [];
      if (option.getDynamicItems) {
        const defaultValues = {};
        Object.entries(options).forEach(([innerKey, innerOption]) => {
          if (innerOption.type === 'checkbox') {
            defaultValues[innerKey] = (innerOption.items || []).filter(item => item.default).map(item => item.id);
          } else if (innerOption.type === 'text') {
            defaultValues[innerKey] = innerOption.default || '';
          } else if (innerOption.items && innerOption.items.length > 0) {
            const defaultItem = innerOption.items.find(item => item.default);
            defaultValues[innerKey] = defaultItem ? defaultItem.id : innerOption.items[0].id;
          }
        });
        items = option.getDynamicItems(defaultValues);
      }
      const defaultItem = items && items.find(item => item.default);
      initialState[key] = defaultItem ? defaultItem.id : items && items[0] ? items[0].id : '';
    });
    return initialState;
  };
  const [values, setValues] = useState(getInitialState);
  const [isDark, setIsDark] = useState(false);
  useEffect(() => {
    const checkDarkMode = () => {
      const html = document.documentElement;
      const isDarkMode = html.classList.contains('dark') || html.getAttribute('data-theme') === 'dark' || html.style.colorScheme === 'dark';
      setIsDark(isDarkMode);
    };
    checkDarkMode();
    const observer = new MutationObserver(checkDarkMode);
    observer.observe(document.documentElement, {
      attributes: true,
      attributeFilter: ['class', 'data-theme', 'style']
    });
    return () => observer.disconnect();
  }, []);
  const handleRadioChange = (optionName, value) => {
    setValues(prev => ({
      ...prev,
      [optionName]: value
    }));
  };
  const handleCheckboxChange = (optionName, itemId, isChecked) => {
    setValues(prev => {
      const currentValues = prev[optionName] || [];
      if (isChecked) {
        return {
          ...prev,
          [optionName]: [...currentValues, itemId]
        };
      }
      return {
        ...prev,
        [optionName]: currentValues.filter(id => id !== itemId)
      };
    });
  };
  const handleTextChange = (optionName, value) => {
    setValues(prev => ({
      ...prev,
      [optionName]: value
    }));
  };
  const command = generateCommand(values);
  const containerStyle = {
    maxWidth: '900px',
    margin: '0 auto',
    display: 'flex',
    flexDirection: 'column',
    gap: '4px'
  };
  const cardStyle = {
    padding: '8px 12px',
    border: `1px solid ${isDark ? '#374151' : '#e5e7eb'}`,
    borderLeft: `3px solid ${isDark ? '#E85D4D' : '#D45D44'}`,
    borderRadius: '4px',
    display: 'flex',
    alignItems: 'center',
    gap: '12px',
    background: isDark ? '#1f2937' : '#fff'
  };
  const titleStyle = {
    fontSize: '13px',
    fontWeight: '600',
    minWidth: '140px',
    flexShrink: 0,
    color: isDark ? '#e5e7eb' : 'inherit'
  };
  const itemsStyle = {
    display: 'flex',
    rowGap: '2px',
    columnGap: '6px',
    flexWrap: 'wrap',
    alignItems: 'center',
    flex: 1
  };
  const labelBaseStyle = {
    padding: '4px 10px',
    border: `1px solid ${isDark ? '#9ca3af' : '#d1d5db'}`,
    borderRadius: '3px',
    cursor: 'pointer',
    display: 'inline-flex',
    flexDirection: 'column',
    alignItems: 'center',
    justifyContent: 'center',
    fontWeight: '500',
    fontSize: '13px',
    transition: 'all 0.2s',
    userSelect: 'none',
    minWidth: '45px',
    textAlign: 'center',
    flex: 1,
    background: isDark ? '#374151' : '#fff',
    color: isDark ? '#e5e7eb' : 'inherit'
  };
  const checkedStyle = {
    background: '#D45D44',
    color: 'white',
    borderColor: '#D45D44'
  };
  const disabledStyle = {
    cursor: 'not-allowed',
    opacity: 0.5
  };
  const subtitleStyle = {
    display: 'block',
    fontSize: '9px',
    marginTop: '1px',
    lineHeight: '1.1',
    opacity: 0.7
  };
  const textInputStyle = {
    flex: 1,
    padding: '8px 10px',
    borderRadius: '4px',
    border: `1px solid ${isDark ? '#4b5563' : '#d1d5db'}`,
    background: isDark ? '#111827' : '#fff',
    color: isDark ? '#e5e7eb' : '#111827',
    fontSize: '13px'
  };
  const commandDisplayStyle = {
    flex: 1,
    padding: '12px 16px',
    background: isDark ? '#111827' : '#f5f5f5',
    borderRadius: '6px',
    fontFamily: "'Menlo', 'Monaco', 'Courier New', monospace",
    fontSize: '12px',
    lineHeight: '1.5',
    color: isDark ? '#e5e7eb' : '#374151',
    whiteSpace: 'pre-wrap',
    overflowX: 'auto',
    margin: 0,
    border: `1px solid ${isDark ? '#374151' : '#e5e7eb'}`
  };
  return <div style={containerStyle} className="not-prose">
      {Object.entries(options).map(([key, option]) => {
    if (option.condition && !option.condition(values)) {
      return null;
    }
    const items = option.getDynamicItems ? option.getDynamicItems(values) : option.items || [];
    return <div key={key} style={cardStyle}>
            <div style={titleStyle}>{option.title}</div>
            <div style={itemsStyle}>
              {option.type === 'text' ? <input type="text" value={values[option.name] || ''} placeholder={option.placeholder || ''} onChange={event => handleTextChange(option.name, event.target.value)} style={textInputStyle} /> : option.type === 'checkbox' ? (option.items || []).map(item => {
      const isChecked = (values[option.name] || []).includes(item.id);
      const isDisabled = item.required || typeof item.disabledWhen === 'function' && item.disabledWhen(values);
      return <label key={item.id} title={item.disabledReason || ''} style={{
        ...labelBaseStyle,
        ...isChecked ? checkedStyle : {},
        ...isDisabled ? disabledStyle : {}
      }}>
                      <input type="checkbox" checked={isChecked} disabled={isDisabled} onChange={event => handleCheckboxChange(option.name, item.id, event.target.checked)} style={{
        display: 'none'
      }} />
                      {item.label}
                      {item.subtitle && <small style={{
        ...subtitleStyle,
        color: isChecked ? 'rgba(255,255,255,0.85)' : 'inherit'
      }}>
                          {item.subtitle}
                        </small>}
                    </label>;
    }) : items.map(item => {
      const isChecked = values[option.name] === item.id;
      const isDisabled = Boolean(item.disabled);
      return <label key={item.id} title={item.disabledReason || ''} style={{
        ...labelBaseStyle,
        ...isChecked ? checkedStyle : {},
        ...isDisabled ? disabledStyle : {}
      }}>
                      <input type="radio" name={option.name} value={item.id} checked={isChecked} disabled={isDisabled} onChange={() => !isDisabled && handleRadioChange(option.name, item.id)} style={{
        display: 'none'
      }} />
                      {item.label}
                      {item.subtitle && <small style={{
        ...subtitleStyle,
        color: isChecked ? 'rgba(255,255,255,0.85)' : 'inherit'
      }}>
                          {item.subtitle}
                        </small>}
                    </label>;
    })}
            </div>
          </div>;
  })}
      <div style={cardStyle}>
        <div style={titleStyle}>Run this Command:</div>
        <pre style={commandDisplayStyle}>{command}</pre>
      </div>
    </div>;
};

## 1. Model Introduction

[Glyph](https://huggingface.co/zai-org/Glyph) is a powerful language model developed by Zhipu AI, featuring advanced capabilities in reasoning, function calling, and multi-modal understanding.

**Hardware Support:** NVIDIA B200/H100/H200, AMD MI300X/MI325X/MI355X

**Key Features:**

* **Advanced Reasoning**: Built-in reasoning capabilities for complex problem-solving
* **Multiple Quantizations**: BF16 and FP8 variants for different performance/memory trade-offs
* **High Performance**: Optimized for both throughput and latency scenarios

**Available Models:**

* **BF16 (Full precision)**: [zai-org/Glyph](https://huggingface.co/zai-org/Glyph)
* **FP8 (8-bit quantized)**: [zai-org/Glyph-FP8](https://huggingface.co/zai-org/Glyph-FP8)

**License:**

Please refer to the [official Glyph model card](https://huggingface.co/zai-org/Glyph) for license details.

## 2. SGLang Installation

SGLang offers multiple installation methods. You can choose the most suitable installation method based on your hardware platform and requirements.

Please refer to the [official SGLang installation guide](../../../docs/get-started/install) for installation instructions.

## 3. Model Deployment

This section provides deployment configurations optimized for different hardware platforms and use cases.

### 3.1 Basic Configuration

**Interactive Command Generator**: Use the configuration selector below to automatically generate the appropriate deployment command for your hardware platform, quantization method, and other options.

<GLMGlyphDeployment />

### 3.2 Configuration Tips

* **Thinking Budget:** Use `--enable-custom-logit-processor` flag and pass `Glm4MoeThinkingBudgetLogitProcessor` in requests to cap the model's thinking token count. See the [GLM-4.5 cookbook page](/cookbook/autoregressive/GLM/GLM-4.5) for the full Thinking Budget usage example.

## 4. Model Invocation

### 4.1 Basic Usage

For basic API usage and request examples, please refer to:

* [SGLang Basic Usage Guide](../../../docs/basic_usage/send_request)

### 4.2 Advanced Usage

#### 4.2.1 Thinking Mode

Glyph supports thinking mode for enhanced reasoning. Enable the reasoning parser during deployment to separate the thinking and content sections:

```shell Command theme={null}
python -m sglang.launch_server \
  --model-path zai-org/Glyph \
  --reasoning-parser glm45 \
  --tp 4
```

**Streaming with Thinking Process:**

```python Example theme={null}
from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:30000/v1",
    api_key="EMPTY"
)

# Enable streaming to see the thinking process in real-time
response = client.chat.completions.create(
    model="zai-org/Glyph",
    messages=[
        {"role": "user", "content": "Solve this problem step by step: What is 15% of 240?"}
    ],
    temperature=0.7,
    max_tokens=2048,
    stream=True
)

# Process the stream
has_thinking = False
has_answer = False
thinking_started = False

for chunk in response:
    if chunk.choices and len(chunk.choices) > 0:
        delta = chunk.choices[0].delta

        # Print thinking process
        if hasattr(delta, 'reasoning_content') and delta.reasoning_content:
            if not thinking_started:
                print("=============== Thinking =================", flush=True)
                thinking_started = True
            has_thinking = True
            print(delta.reasoning_content, end="", flush=True)

        # Print answer content
        if delta.content:
            # Close thinking section and add content header
            if has_thinking and not has_answer:
                print("\n=============== Content =================", flush=True)
                has_answer = True
            print(delta.content, end="", flush=True)

print()
```

**Note:** The reasoning parser captures the model's step-by-step thinking process, allowing you to see how the model arrives at its conclusions.

**Disable Thinking Mode:**

To disable thinking mode for a specific request:

```python Example theme={null}
response = client.chat.completions.create(
    model="zai-org/Glyph",
    messages=[{"role": "user", "content": "What is the capital of France?"}],
    extra_body={"chat_template_kwargs": {"enable_thinking": False}}
)
```

#### 4.2.2 Tool Calling

Glyph supports tool calling capabilities. Enable the tool call parser:

```shell Command theme={null}
python -m sglang.launch_server \
  --model-path zai-org/Glyph \
  --reasoning-parser glm45 \
  --tool-call-parser glm45 \
  --tp 4
```

**Python Example (with Thinking Process):**

```python Example theme={null}
from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:30000/v1",
    api_key="EMPTY"
)

# Define available tools
tools = [
    {
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Get the current weather for a location",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {
                        "type": "string",
                        "description": "The city name"
                    },
                    "unit": {
                        "type": "string",
                        "enum": ["celsius", "fahrenheit"],
                        "description": "Temperature unit"
                    }
                },
                "required": ["location"]
            }
        }
    }
]

# Make request with streaming to see thinking process
response = client.chat.completions.create(
    model="zai-org/Glyph",
    messages=[
        {"role": "user", "content": "What's the weather in Beijing?"}
    ],
    tools=tools,
    temperature=0.7,
    stream=True
)

# Process streaming response
thinking_started = False
has_thinking = False
tool_calls_accumulator = {}

for chunk in response:
    if chunk.choices and len(chunk.choices) > 0:
        delta = chunk.choices[0].delta

        # Print thinking process
        if hasattr(delta, 'reasoning_content') and delta.reasoning_content:
            if not thinking_started:
                print("=============== Thinking =================", flush=True)
                thinking_started = True
            has_thinking = True
            print(delta.reasoning_content, end="", flush=True)

        # Accumulate tool calls
        if hasattr(delta, 'tool_calls') and delta.tool_calls:
            # Close thinking section if needed
            if has_thinking and thinking_started:
                print("\n=============== Content =================\n", flush=True)
                thinking_started = False

            for tool_call in delta.tool_calls:
                index = tool_call.index
                if index not in tool_calls_accumulator:
                    tool_calls_accumulator[index] = {
                        'name': None,
                        'arguments': ''
                    }

                if tool_call.function:
                    if tool_call.function.name:
                        tool_calls_accumulator[index]['name'] = tool_call.function.name
                    if tool_call.function.arguments:
                        tool_calls_accumulator[index]['arguments'] += tool_call.function.arguments

        # Print content
        if delta.content:
            print(delta.content, end="", flush=True)

# Print accumulated tool calls
for index, tool_call in sorted(tool_calls_accumulator.items()):
    print(f"Tool Call: {tool_call['name']}")
    print(f"   Arguments: {tool_call['arguments']}")

print()
```

**Output Example:**

```text Output theme={null}
=============== Thinking =================
The user is asking about the weather in Beijing. I need to use the get_weather function to retrieve this information.
I should call the function with location="Beijing".
=============== Content =================

Tool Call: get_weather
   Arguments: {"location": "Beijing", "unit": "celsius"}
```

**Note:**

* The reasoning parser shows how the model decides to use a tool
* Tool calls are clearly marked with the function name and arguments
* You can then execute the function and send the result back to continue the conversation

**Handling Tool Call Results:**

```python Example theme={null}
# After getting the tool call, execute the function
def get_weather(location, unit="celsius"):
    # Your actual weather API call here
    return f"The weather in {location} is 22°{unit[0].upper()} and sunny."

# Send tool result back to the model
messages = [
    {"role": "user", "content": "What's the weather in Beijing?"},
    {
        "role": "assistant",
        "content": None,
        "tool_calls": [{
            "id": "call_123",
            "type": "function",
            "function": {
                "name": "get_weather",
                "arguments": '{"location": "Beijing", "unit": "celsius"}'
            }
        }]
    },
    {
        "role": "tool",
        "tool_call_id": "call_123",
        "content": get_weather("Beijing", "celsius")
    }
]

final_response = client.chat.completions.create(
    model="zai-org/Glyph",
    messages=messages,
    temperature=0.7
)

print(final_response.choices[0].message.content)
# Output: "The weather in Beijing is currently 22°C and sunny."
```

## 5. Benchmark

This section uses **industry-standard configurations** for comparable benchmark results.

### 5.1 Speed Benchmark

**Test Environment:**

* Model: Glyph
* SGLang Version: 0.5.6.post1

**Benchmark Methodology:**

We use industry-standard benchmark configurations to ensure results are comparable across frameworks and hardware platforms.

#### 5.1.1 Standard Scenario Benchmark

* **Model Deployment**

```bash Command theme={null}
python -m sglang.launch_server \
  --model zai-org/Glyph \
  --tp 2
```

##### 5.1.1.1 Low Concurrency

* **Benchmark Command**:

```bash Command theme={null}
python -m sglang.bench_serving \
  --backend sglang \
  --model zai-org/Glyph \
  --dataset-name random \
  --random-input-len 1000 \
  --random-output-len 1000 \
  --num-prompts 10 \
  --max-concurrency 1 \
  --request-rate inf
```

* **Test Results**:

```text Output theme={null}
============ Serving Benchmark Result ============
Backend:                                 sglang
Traffic request rate:                    inf
Max request concurrency:                 1
Successful requests:                     10
Benchmark duration (s):                  17.03
Total input tokens:                      6101
Total input text tokens:                 6101
Total input vision tokens:               0
Total generated tokens:                  4220
Total generated tokens (retokenized):    4220
Request throughput (req/s):              0.59
Input token throughput (tok/s):          358.17
Output token throughput (tok/s):         247.74
Peak output token throughput (tok/s):    251.00
Peak concurrent requests:                3
Total token throughput (tok/s):          605.91
Concurrency:                             1.00
----------------End-to-End Latency----------------
Mean E2E Latency (ms):                   1702.14
Median E2E Latency (ms):                 1361.72
---------------Time to First Token----------------
Mean TTFT (ms):                          22.35
Median TTFT (ms):                        22.61
P99 TTFT (ms):                           23.76
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms):                          3.99
Median TPOT (ms):                        3.99
P99 TPOT (ms):                           4.01
---------------Inter-Token Latency----------------
Mean ITL (ms):                           3.99
Median ITL (ms):                         3.99
P95 ITL (ms):                            4.03
P99 ITL (ms):                            4.12
Max ITL (ms):                            7.46
==================================================
```

##### 5.1.1.2 Medium Concurrency

* **Benchmark Command**:

```bash Command theme={null}
python -m sglang.bench_serving \
  --backend sglang \
  --model zai-org/Glyph \
  --dataset-name random \
  --random-input-len 1000 \
  --random-output-len 1000 \
  --num-prompts 80 \
  --max-concurrency 16 \
  --request-rate inf
```

* **Test Results**:

```text Output theme={null}
============ Serving Benchmark Result ============
Backend:                                 sglang
Traffic request rate:                    inf
Max request concurrency:                 16
Successful requests:                     80
Benchmark duration (s):                  16.27
Total input tokens:                      39668
Total input text tokens:                 39668
Total input vision tokens:               0
Total generated tokens:                  40805
Total generated tokens (retokenized):    40804
Request throughput (req/s):              4.92
Input token throughput (tok/s):          2438.06
Output token throughput (tok/s):         2507.94
Peak output token throughput (tok/s):    3069.00
Peak concurrent requests:                26
Total token throughput (tok/s):          4946.00
Concurrency:                             13.44
----------------End-to-End Latency----------------
Mean E2E Latency (ms):                   2733.43
Median E2E Latency (ms):                 2892.98
---------------Time to First Token----------------
Mean TTFT (ms):                          33.10
Median TTFT (ms):                        27.73
P99 TTFT (ms):                           49.34
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms):                          5.33
Median TPOT (ms):                        5.39
P99 TPOT (ms):                           5.86
---------------Inter-Token Latency----------------
Mean ITL (ms):                           5.30
Median ITL (ms):                         4.89
P95 ITL (ms):                            5.54
P99 ITL (ms):                            21.17
Max ITL (ms):                            25.14
==================================================
```

##### 5.1.1.3 High Concurrency

* **Benchmark Command**:

```bash Command theme={null}
python -m sglang.bench_serving \
  --backend sglang \
  --model zai-org/Glyph \
  --dataset-name random \
  --random-input-len 1000 \
  --random-output-len 1000 \
  --num-prompts 500 \
  --max-concurrency 100 \
  --request-rate inf
```

* **Test Results**:

```text Output theme={null}
============ Serving Benchmark Result ============
Backend:                                 sglang
Traffic request rate:                    inf
Max request concurrency:                 100
Successful requests:                     500
Benchmark duration (s):                  25.67
Total input tokens:                      249831
Total input text tokens:                 249831
Total input vision tokens:               0
Total generated tokens:                  252662
Total generated tokens (retokenized):    252657
Request throughput (req/s):              19.48
Input token throughput (tok/s):          9733.69
Output token throughput (tok/s):         9843.99
Peak output token throughput (tok/s):    13398.00
Peak concurrent requests:                127
Total token throughput (tok/s):          19577.68
Concurrency:                             89.49
----------------End-to-End Latency----------------
Mean E2E Latency (ms):                   4593.75
Median E2E Latency (ms):                 4431.03
---------------Time to First Token----------------
Mean TTFT (ms):                          48.66
Median TTFT (ms):                        35.88
P99 TTFT (ms):                           120.61
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms):                          9.10
Median TPOT (ms):                        9.55
P99 TPOT (ms):                           11.00
---------------Inter-Token Latency----------------
Mean ITL (ms):                           9.01
Median ITL (ms):                         6.51
P95 ITL (ms):                            23.19
P99 ITL (ms):                            25.54
Max ITL (ms):                            52.93
==================================================
```

#### 5.1.2 Reasoning Scenario Benchmark

##### 5.1.2.1 Low Concurrency

* **Benchmark Command**:

```bash Command theme={null}
python -m sglang.bench_serving \
  --backend sglang \
  --model zai-org/Glyph \
  --dataset-name random \
  --random-input-len 1000 \
  --random-output-len 8000 \
  --num-prompts 10 \
  --max-concurrency 1 \
  --request-rate inf
```

* **Test Results**:

```text Output theme={null}
============ Serving Benchmark Result ============
Backend:                                 sglang
Traffic request rate:                    inf
Max request concurrency:                 1
Successful requests:                     10
Benchmark duration (s):                  201.53
Total input tokens:                      6101
Total input text tokens:                 6101
Total input vision tokens:               0
Total generated tokens:                  44462
Total generated tokens (retokenized):    44455
Request throughput (req/s):              0.05
Input token throughput (tok/s):          30.27
Output token throughput (tok/s):         220.63
Peak output token throughput (tok/s):    251.00
Peak concurrent requests:                2
Total token throughput (tok/s):          250.90
Concurrency:                             1.00
----------------End-to-End Latency----------------
Mean E2E Latency (ms):                   20151.45
Median E2E Latency (ms):                 21576.31
---------------Time to First Token----------------
Mean TTFT (ms):                          2362.23
Median TTFT (ms):                        23.03
P99 TTFT (ms):                           21310.14
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms):                          4.00
Median TPOT (ms):                        4.00
P99 TPOT (ms):                           4.01
---------------Inter-Token Latency----------------
Mean ITL (ms):                           4.00
Median ITL (ms):                         4.00
P95 ITL (ms):                            4.05
P99 ITL (ms):                            4.08
Max ITL (ms):                            5.67
==================================================
```

##### 5.1.2.2 Medium Concurrency

* **Benchmark Command**:

```bash Command theme={null}
python -m sglang.bench_serving \
  --backend sglang \
  --model zai-org/Glyph \
  --dataset-name random \
  --random-input-len 1000 \
  --random-output-len 8000 \
  --num-prompts 80 \
  --max-concurrency 16 \
  --request-rate inf
```

* **Test Results**:

```text Output theme={null}
============ Serving Benchmark Result ============
Backend:                                 sglang
Traffic request rate:                    inf
Max request concurrency:                 16
Successful requests:                     80
Benchmark duration (s):                  118.67
Total input tokens:                      39668
Total input text tokens:                 39668
Total input vision tokens:               0
Total generated tokens:                  318306
Total generated tokens (retokenized):    318270
Request throughput (req/s):              0.67
Input token throughput (tok/s):          334.27
Output token throughput (tok/s):         2682.26
Peak output token throughput (tok/s):    3264.00
Peak concurrent requests:                19
Total token throughput (tok/s):          3016.53
Concurrency:                             13.74
----------------End-to-End Latency----------------
Mean E2E Latency (ms):                   20387.23
Median E2E Latency (ms):                 20466.09
---------------Time to First Token----------------
Mean TTFT (ms):                          132.47
Median TTFT (ms):                        27.19
P99 TTFT (ms):                           583.15
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms):                          5.09
Median TPOT (ms):                        5.13
P99 TPOT (ms):                           5.19
---------------Inter-Token Latency----------------
Mean ITL (ms):                           5.09
Median ITL (ms):                         5.08
P95 ITL (ms):                            5.18
P99 ITL (ms):                            5.57
Max ITL (ms):                            522.26
==================================================
```

##### 5.1.2.3 High Concurrency

* **Benchmark Command**:

```bash Command theme={null}
python -m sglang.bench_serving \
  --backend sglang \
  --model zai-org/Glyph \
  --dataset-name random \
  --random-input-len 1000 \
  --random-output-len 8000 \
  --num-prompts 320 \
  --max-concurrency 64 \
  --request-rate inf
```

* **Test Results**:

```text Output theme={null}
============ Serving Benchmark Result ============
Backend:                                 sglang
Traffic request rate:                    inf
Max request concurrency:                 64
Successful requests:                     320
Benchmark duration (s):                  150.00
Total input tokens:                      158939
Total input text tokens:                 158939
Total input vision tokens:               0
Total generated tokens:                  1301025
Total generated tokens (retokenized):    1300901
Request throughput (req/s):              2.13
Input token throughput (tok/s):          1059.59
Output token throughput (tok/s):         8673.49
Peak output token throughput (tok/s):    11899.00
Peak concurrent requests:                71
Total token throughput (tok/s):          9733.09
Concurrency:                             54.71
----------------End-to-End Latency----------------
Mean E2E Latency (ms):                   25645.42
Median E2E Latency (ms):                 26913.26
---------------Time to First Token----------------
Mean TTFT (ms):                          163.75
Median TTFT (ms):                        93.67
P99 TTFT (ms):                           426.19
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms):                          6.27
Median TPOT (ms):                        6.39
P99 TPOT (ms):                           6.59
---------------Inter-Token Latency----------------
Mean ITL (ms):                           6.27
Median ITL (ms):                         0.17
P95 ITL (ms):                            32.94
P99 ITL (ms):                            67.89
Max ITL (ms):                            136.00
==================================================
```

#### 5.1.3 Summarization Scenario Benchmark

##### 5.1.3.1 Low Concurrency

* **Benchmark Command**:

```bash Command theme={null}
python -m sglang.bench_serving \
  --backend sglang \
  --model zai-org/Glyph \
  --dataset-name random \
  --random-input-len 8000 \
  --random-output-len 1000 \
  --num-prompts 10 \
  --max-concurrency 1 \
  --request-rate inf
```

* **Test Results**:

```text Output theme={null}
============ Serving Benchmark Result ============
Backend:                                 sglang
Traffic request rate:                    inf
Max request concurrency:                 1
Successful requests:                     10
Benchmark duration (s):                  17.44
Total input tokens:                      41941
Total input text tokens:                 41941
Total input vision tokens:               0
Total generated tokens:                  4220
Total generated tokens (retokenized):    4220
Request throughput (req/s):              0.57
Input token throughput (tok/s):          2405.19
Output token throughput (tok/s):         242.00
Peak output token throughput (tok/s):    250.00
Peak concurrent requests:                2
Total token throughput (tok/s):          2647.19
Concurrency:                             1.00
----------------End-to-End Latency----------------
Mean E2E Latency (ms):                   1742.54
Median E2E Latency (ms):                 1412.47
---------------Time to First Token----------------
Mean TTFT (ms):                          53.48
Median TTFT (ms):                        45.05
P99 TTFT (ms):                           98.57
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms):                          4.01
Median TPOT (ms):                        4.01
P99 TPOT (ms):                           4.03
---------------Inter-Token Latency----------------
Mean ITL (ms):                           4.01
Median ITL (ms):                         4.01
P95 ITL (ms):                            4.06
P99 ITL (ms):                            4.09
Max ITL (ms):                            4.95
==================================================
```

##### 5.1.3.2 Medium Concurrency

* **Benchmark Command**:

```bash Command theme={null}
python -m sglang.bench_serving \
  --backend sglang \
  --model zai-org/Glyph \
  --dataset-name random \
  --random-input-len 8000 \
  --random-output-len 1000 \
  --num-prompts 80 \
  --max-concurrency 16 \
  --request-rate inf
```

* **Test Results**:

```text Output theme={null}
============ Serving Benchmark Result ============
Backend:                                 sglang
Traffic request rate:                    inf
Max request concurrency:                 16
Successful requests:                     80
Benchmark duration (s):                  16.90
Total input tokens:                      300020
Total input text tokens:                 300020
Total input vision tokens:               0
Total generated tokens:                  41669
Total generated tokens (retokenized):    41668
Request throughput (req/s):              4.73
Input token throughput (tok/s):          17753.58
Output token throughput (tok/s):         2465.75
Peak output token throughput (tok/s):    3005.00
Peak concurrent requests:                25
Total token throughput (tok/s):          20219.33
Concurrency:                             13.68
----------------End-to-End Latency----------------
Mean E2E Latency (ms):                   2890.33
Median E2E Latency (ms):                 3069.55
---------------Time to First Token----------------
Mean TTFT (ms):                          41.46
Median TTFT (ms):                        31.75
P99 TTFT (ms):                           93.18
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms):                          5.52
Median TPOT (ms):                        5.58
P99 TPOT (ms):                           6.14
---------------Inter-Token Latency----------------
Mean ITL (ms):                           5.48
Median ITL (ms):                         5.13
P95 ITL (ms):                            5.93
P99 ITL (ms):                            20.76
Max ITL (ms):                            36.01
==================================================
```

##### 5.1.3.3 High Concurrency

* **Benchmark Command**:

```bash Command theme={null}
python -m sglang.bench_serving \
  --backend sglang \
  --model zai-org/Glyph \
  --dataset-name random \
  --random-input-len 8000 \
  --random-output-len 1000 \
  --num-prompts 320 \
  --max-concurrency 64 \
  --request-rate inf
```

* **Test Results**:

```text Output theme={null}
============ Serving Benchmark Result ============
Backend:                                 sglang
Traffic request rate:                    inf
Max request concurrency:                 64
Successful requests:                     320
Benchmark duration (s):                  35.54
Total input tokens:                      1273893
Total input text tokens:                 1273893
Total input vision tokens:               0
Total generated tokens:                  170000
Total generated tokens (retokenized):    169994
Request throughput (req/s):              9.01
Input token throughput (tok/s):          35848.57
Output token throughput (tok/s):         4783.96
Peak output token throughput (tok/s):    8396.00
Peak concurrent requests:                80
Total token throughput (tok/s):          40632.53
Concurrency:                             59.26
----------------End-to-End Latency----------------
Mean E2E Latency (ms):                   6580.96
Median E2E Latency (ms):                 6248.74
---------------Time to First Token----------------
Mean TTFT (ms):                          345.27
Median TTFT (ms):                        96.06
P99 TTFT (ms):                           2823.92
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms):                          12.26
Median TPOT (ms):                        12.53
P99 TPOT (ms):                           23.58
---------------Inter-Token Latency----------------
Mean ITL (ms):                           11.76
Median ITL (ms):                         6.57
P95 ITL (ms):                            27.66
P99 ITL (ms):                            91.24
Max ITL (ms):                            2609.64
==================================================
```

### 5.2 Accuracy Benchmark

This section reports model accuracy on standard benchmarks.

#### 5.2.1 GSM8K Benchmark

* **Benchmark Command**:

```bash Command theme={null}
python -m sglang.test.few_shot_gsm8k \
  --num-questions 200
```

* **Test Results**:

```text Output theme={null}
Accuracy: 0.890
Invalid: 0.000
Latency: 3.718 s
Output throughput: 5245.606 token/s
```