> ## Documentation Index
> Fetch the complete documentation index at: https://docs.sglang.io/llms.txt
> Use this file to discover all available pages before exploring further.

# GLM-OCR

export const GLMOCRDeployment = () => {
  const options = {
    hardware: {
      name: 'hardware',
      title: 'Hardware Platform',
      items: [{
        id: 'h100',
        label: 'H100',
        default: true
      }, {
        id: 'h200',
        label: 'H200',
        default: false
      }, {
        id: 'b200',
        label: 'B200',
        default: false
      }]
    },
    strategy: {
      name: 'strategy',
      title: 'Deployment Strategy',
      type: 'checkbox',
      items: [{
        id: 'mtp',
        label: 'MTP',
        subtitle: 'Multi-token Prediction',
        default: true
      }]
    }
  };
  const getInitialState = () => {
    const initialState = {};
    Object.entries(options).forEach(([key, option]) => {
      if (option.type === 'checkbox') {
        initialState[key] = option.items.filter(item => item.default).map(item => item.id);
      } else {
        const defaultItem = option.items.find(item => item.default);
        initialState[key] = defaultItem ? defaultItem.id : option.items[0].id;
      }
    });
    return initialState;
  };
  const [values, setValues] = useState(getInitialState);
  const [isDark, setIsDark] = useState(false);
  useEffect(() => {
    const checkDarkMode = () => {
      const html = document.documentElement;
      const isDarkMode = html.classList.contains('dark') || html.getAttribute('data-theme') === 'dark' || html.style.colorScheme === 'dark';
      setIsDark(isDarkMode);
    };
    checkDarkMode();
    const observer = new MutationObserver(checkDarkMode);
    observer.observe(document.documentElement, {
      attributes: true,
      attributeFilter: ['class', 'data-theme', 'style']
    });
    return () => observer.disconnect();
  }, []);
  const handleRadioChange = (optionName, value) => {
    setValues(prev => ({
      ...prev,
      [optionName]: value
    }));
  };
  const handleCheckboxChange = (optionName, itemId, isChecked) => {
    setValues(prev => {
      const currentValues = prev[optionName] || [];
      if (isChecked) {
        return {
          ...prev,
          [optionName]: [...currentValues, itemId]
        };
      } else {
        return {
          ...prev,
          [optionName]: currentValues.filter(id => id !== itemId)
        };
      }
    });
  };
  const generateCommand = () => {
    const {strategy} = values;
    const strategyArray = Array.isArray(strategy) ? strategy : [];
    const modelName = 'zai-org/GLM-OCR';
    let cmd = 'SGLANG_USE_CUDA_IPC_TRANSPORT=1 python -m sglang.launch_server \\\n';
    cmd += `  --model ${modelName}`;
    if (strategyArray.includes('mtp')) {
      cmd += ` \\\n  --speculative-algorithm EAGLE`;
      cmd += ` \\\n  --speculative-num-steps 3`;
      cmd += ` \\\n  --speculative-eagle-topk 1`;
      cmd += ` \\\n  --speculative-num-draft-tokens 4`;
    }
    return cmd;
  };
  const containerStyle = {
    maxWidth: '900px',
    margin: '0 auto',
    display: 'flex',
    flexDirection: 'column',
    gap: '4px'
  };
  const cardStyle = {
    padding: '8px 12px',
    border: `1px solid ${isDark ? '#374151' : '#e5e7eb'}`,
    borderLeft: `3px solid ${isDark ? '#E85D4D' : '#D45D44'}`,
    borderRadius: '4px',
    display: 'flex',
    alignItems: 'center',
    gap: '12px',
    background: isDark ? '#1f2937' : '#fff'
  };
  const titleStyle = {
    fontSize: '13px',
    fontWeight: '600',
    minWidth: '140px',
    flexShrink: 0,
    color: isDark ? '#e5e7eb' : 'inherit'
  };
  const itemsStyle = {
    display: 'flex',
    rowGap: '2px',
    columnGap: '6px',
    flexWrap: 'wrap',
    alignItems: 'center',
    flex: 1
  };
  const labelBaseStyle = {
    padding: '4px 10px',
    border: `1px solid ${isDark ? '#9ca3af' : '#d1d5db'}`,
    borderRadius: '3px',
    cursor: 'pointer',
    display: 'inline-flex',
    flexDirection: 'column',
    alignItems: 'center',
    justifyContent: 'center',
    fontWeight: '500',
    fontSize: '13px',
    transition: 'all 0.2s',
    userSelect: 'none',
    minWidth: '45px',
    textAlign: 'center',
    flex: 1,
    background: isDark ? '#374151' : '#fff',
    color: isDark ? '#e5e7eb' : 'inherit'
  };
  const checkedStyle = {
    background: '#D45D44',
    color: 'white',
    borderColor: '#D45D44'
  };
  const disabledStyle = {
    cursor: 'not-allowed',
    opacity: 0.5
  };
  const subtitleStyle = {
    display: 'block',
    fontSize: '9px',
    marginTop: '1px',
    lineHeight: '1.1',
    opacity: 0.7
  };
  const commandDisplayStyle = {
    flex: 1,
    padding: '12px 16px',
    background: isDark ? '#111827' : '#f5f5f5',
    borderRadius: '6px',
    fontFamily: "'Menlo', 'Monaco', 'Courier New', monospace",
    fontSize: '12px',
    lineHeight: '1.5',
    color: isDark ? '#e5e7eb' : '#374151',
    whiteSpace: 'pre-wrap',
    overflowX: 'auto',
    margin: 0,
    border: `1px solid ${isDark ? '#374151' : '#e5e7eb'}`
  };
  return <div style={containerStyle} className="not-prose">
      {Object.entries(options).map(([key, option]) => <div key={key} style={cardStyle}>
          <div style={titleStyle}>{option.title}</div>
          <div style={itemsStyle}>
            {option.type === 'checkbox' ? option.items.map(item => {
    const isChecked = (values[option.name] || []).includes(item.id);
    const isDisabled = item.required;
    return <label key={item.id} style={{
      ...labelBaseStyle,
      ...isChecked ? checkedStyle : {},
      ...isDisabled ? disabledStyle : {}
    }}>
                    <input type="checkbox" checked={isChecked} disabled={isDisabled} onChange={e => handleCheckboxChange(option.name, item.id, e.target.checked)} style={{
      display: 'none'
    }} />
                    {item.label}
                    {item.subtitle && <small style={{
      ...subtitleStyle,
      color: isChecked ? 'rgba(255,255,255,0.85)' : 'inherit'
    }}>{item.subtitle}</small>}
                  </label>;
  }) : option.items.map(item => {
    const isChecked = values[option.name] === item.id;
    return <label key={item.id} style={{
      ...labelBaseStyle,
      ...isChecked ? checkedStyle : {}
    }}>
                    <input type="radio" name={option.name} value={item.id} checked={isChecked} onChange={() => handleRadioChange(option.name, item.id)} style={{
      display: 'none'
    }} />
                    {item.label}
                    {item.subtitle && <small style={{
      ...subtitleStyle,
      color: isChecked ? 'rgba(255,255,255,0.85)' : 'inherit'
    }}>{item.subtitle}</small>}
                  </label>;
  })}
          </div>
        </div>)}
      <div style={cardStyle}>
        <div style={titleStyle}>Run this Command:</div>
        <pre style={commandDisplayStyle}>{generateCommand()}</pre>
      </div>
    </div>;
};

## 1. Model Introduction

[GLM-OCR](https://huggingface.co/zai-org/GLM-OCR) is a multimodal OCR model for complex document understanding, built on the GLM-V encoder–decoder architecture. It introduces Multi-Token Prediction (MTP) loss and stable full-task reinforcement learning to improve training efficiency, recognition accuracy, and generalization.

The model integrates the CogViT visual encoder pre-trained on large-scale image–text data, a lightweight cross-modal connector with efficient token downsampling, and a GLM-0.5B language decoder. Combined with a two-stage pipeline of layout analysis and parallel recognition based on PP-DocLayout-V3, GLM-OCR delivers robust and high-quality OCR performance across diverse document layouts.

**Hardware Support:** NVIDIA B200/H100/H200

**Key Features:**

* **State-of-the-Art Performance**: Achieves 94.62 on OmniDocBench V1.5, ranking #1, and delivers SOTA results across major document understanding benchmarks, including formula recognition, table recognition, and information extraction.
* **Optimized for Real-World Scenarios**: Specifically optimized for practical business cases, maintaining stable and accurate performance on complex tables, code documents, seals, and other challenging layouts.
* **Efficient Inference**: With only 0.9B parameters, GLM-OCR supports deployment via vLLM and SGLang, significantly reducing inference latency and compute cost—well suited for high-concurrency and edge deployments.
* **Easy to Use**: Fully open-sourced with a complete SDK and inference toolchain, enabling one-line invocation and seamless integration into existing systems.

For more details, please refer to the [official GLM-OCR model card](https://huggingface.co/zai-org/GLM-OCR).

## 2. SGLang Installation

SGLang offers multiple installation methods. You can choose the most suitable installation method based on your hardware platform and requirements.

Please refer to the [official SGLang installation guide](../../../docs/get-started/install) for installation instructions.

## 3. Model Deployment

This section provides deployment configurations optimized for different hardware platforms and use cases.

### 3.1 Basic Configuration

**Interactive Command Generator**: Use the configuration selector below to generate the deployment command for your hardware platform and deployment options. MTP (Multi-Token Prediction), which uses EAGLE speculative decoding for faster inference, is selected by default; deselect it to generate a command without speculative decoding.

<GLMOCRDeployment />

### 3.2 Configuration Tips

* **CUDA IPC Transport**: The `SGLANG_USE_CUDA_IPC_TRANSPORT=1` environment variable enables CUDA IPC for transferring multimodal features, which significantly improves time to first token (TTFT).
* **MTP (Multi-Token Prediction)**: Enable MTP to use EAGLE speculative decoding for faster inference. This feature predicts multiple tokens at once to reduce latency.
* **Memory Management**: For memory-constrained environments, you may need to adjust `--mem-fraction-static` and/or `--max-running-requests`.

## 4. Model Invocation

### 4.1 Basic Usage

For basic API usage and request examples, please refer to:

* [SGLang Basic Usage Guide](../../../docs/basic_usage/send_request)
* [SGLang OpenAI Vision API Guide](../../../docs/basic_usage/openai_api_vision)

### 4.2 Advanced Usage

#### 4.2.1 OCR Image Processing

GLM-OCR supports OCR tasks on various document types. Here's a basic example:

```python Example theme={null}
# Sends one OCR request to an OpenAI-compatible endpoint at localhost:30000
# and prints the elapsed time together with the generated text.
import time
from openai import OpenAI

# "EMPTY" is a placeholder API key; base_url points at the local server.
client = OpenAI(
    api_key="EMPTY",
    base_url="http://localhost:30000/v1",
    timeout=3600
)

# A single user turn: the image (referenced by URL) followed by the text prompt.
messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "image_url",
                "image_url": {
                    "url": "https://ofasys-multimodal-wlcb-3-toshanghai.oss-accelerate.aliyuncs.com/wpf272043/keepme/image/receipt.png"
                }
            },
            {
                "type": "text",
                "text": "Please extract all text from this image."
            }
        ]
    }
]

# Time the whole request/response round trip.
start = time.time()
response = client.chat.completions.create(
    model="zai-org/GLM-OCR",
    messages=messages,
    max_tokens=2048
)
print(f"Response costs: {time.time() - start:.2f}s")
# The recognized text is the content of the first returned choice.
print(f"Generated text: {response.choices[0].message.content}")
```

**Example Output:**

```text Output theme={null}
Response costs: 2.29s
Generated text: CINNAMON SUGAR
1 x 17,000 17,000

SUB TOTAL 17,000

GRAND TOTAL 17,000

CASH IDR 20,000

CHANGE DUE 3,000

```

#### 4.2.2 Complex Document Processing

GLM-OCR excels at processing complex documents including:

* **Tables**: Accurate extraction of tabular data with structure preservation
* **Formulas**: Mathematical formula recognition
* **Code Documents**: Source code extraction from screenshots
* **Seals and Stamps**: Recognition of seals and stamps in documents
* **Multi-layout Documents**: Mixed content with text, images, and tables

```python Example theme={null}
# Asks the model to extract a table from a document image and return it as
# markdown, using an OpenAI-compatible endpoint at localhost:30000.
# NOTE: `time` is imported but not used in this example.
import time
from openai import OpenAI

# "EMPTY" is a placeholder API key; base_url points at the local server.
client = OpenAI(
    api_key="EMPTY",
    base_url="http://localhost:30000/v1",
    timeout=3600
)

# Example: Processing a document with tables
# Replace YOUR_DOCUMENT_IMAGE_URL with the URL of the image to process.
messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "image_url",
                "image_url": {
                    "url": "YOUR_DOCUMENT_IMAGE_URL"
                }
            },
            {
                "type": "text",
                "text": "Please extract the table content from this document and format it as markdown."
            }
        ]
    }
]

# max_tokens is 4096 here, versus 2048 in the basic OCR example.
response = client.chat.completions.create(
    model="zai-org/GLM-OCR",
    messages=messages,
    max_tokens=4096
)
# Print the content of the first returned choice.
print(response.choices[0].message.content)
```

## 5. Benchmark

### 5.1 Accuracy Benchmark

Model accuracy on standard document-understanding benchmarks:

#### 5.1.1 OCRBench Benchmark

* Benchmark Command

```bash Command theme={null}
# Run the lmms_eval `ocrbench` task through the `openai_compatible` model
# adapter for zai-org/GLM-OCR, logging samples and writing output to ./logs.
# NOTE(review): presumably needs a running GLM-OCR server and endpoint settings
# configured outside this command — confirm.
python3 -m lmms_eval \
  --model openai_compatible \
  --model_args "model_version=zai-org/GLM-OCR" \
  --tasks ocrbench \
  --batch_size 128 \
  --log_samples \
  --log_samples_suffix "openai_compatible" \
  --output_path ./logs
```

* Test Result

<table style={{width: "100%", borderCollapse: "collapse", tableLayout: "fixed"}}>
  <colgroup>
    <col style={{width: "12.5%"}} />

    <col style={{width: "12.5%"}} />

    <col style={{width: "12.5%"}} />

    <col style={{width: "12.5%"}} />

    <col style={{width: "12.5%"}} />

    <col style={{width: "12.5%"}} />

    <col style={{width: "12.5%"}} />

    <col style={{width: "12.5%"}} />
  </colgroup>

  <thead>
    <tr style={{borderBottom: "2px solid #d55816"}}>
      <th style={{textAlign: "left", padding: "10px 12px", fontWeight: 700, whiteSpace: "nowrap", backgroundColor: "rgba(255,255,255,0.02)"}}>Tasks</th>
      <th style={{textAlign: "left", padding: "10px 12px", fontWeight: 700, whiteSpace: "nowrap", backgroundColor: "rgba(255,255,255,0.05)"}}>Version</th>
      <th style={{textAlign: "left", padding: "10px 12px", fontWeight: 700, whiteSpace: "nowrap", backgroundColor: "rgba(255,255,255,0.02)"}}>Filter</th>
      <th style={{textAlign: "left", padding: "10px 12px", fontWeight: 700, whiteSpace: "nowrap", backgroundColor: "rgba(255,255,255,0.05)"}}>n-shot</th>
      <th style={{textAlign: "left", padding: "10px 12px", fontWeight: 700, whiteSpace: "nowrap", backgroundColor: "rgba(255,255,255,0.02)"}}>Metric</th>

      <th style={{textAlign: "left", padding: "10px 12px", fontWeight: 700, whiteSpace: "nowrap", backgroundColor: "rgba(255,255,255,0.05)"}} />

      <th style={{textAlign: "left", padding: "10px 12px", fontWeight: 700, whiteSpace: "nowrap", backgroundColor: "rgba(255,255,255,0.02)"}}>Value</th>
      <th style={{textAlign: "left", padding: "10px 12px", fontWeight: 700, whiteSpace: "nowrap", backgroundColor: "rgba(255,255,255,0.05)"}}>Stderr</th>
    </tr>
  </thead>

  <tbody>
    <tr>
      <td style={{padding: "9px 12px", fontWeight: 500, backgroundColor: "rgba(255,255,255,0.02)"}}>ocrbench</td>
      <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.05)"}}>Yaml</td>
      <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.02)"}}>none</td>
      <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.05)"}}>0</td>
      <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.02)"}}>ocrbench\_accuracy</td>
      <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.05)"}}>↑</td>
      <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.02)"}}>0.806</td>
      <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.05)"}}>N/A</td>
    </tr>
  </tbody>
</table>

#### 5.1.2 OmniDocBench V1.5

GLM-OCR achieves **94.62** on OmniDocBench V1.5, ranking #1 among all models, demonstrating state-of-the-art performance across major document understanding benchmarks.
