> ## Documentation Index
> Fetch the complete documentation index at: https://docs.sglang.io/llms.txt
> Use this file to discover all available pages before exploring further.

# DeepSeek-OCR-2

export const DeepSeekOCR2Deployment = () => {
  // Selector definitions for the command generator, keyed by option name.
  // `type: 'checkbox'` marks a multi-select group; entries without a `type`
  // are rendered as single-choice groups.
  const hardwareIds = ['h200', 'b200', 'mi300x', 'mi325x', 'mi355x', 'xeon'];
  const xeonTpOnlyReason = 'Intel Xeon CPUs only support Tensor Parallel (TP)';
  // DP and EP are unavailable whenever the Xeon platform is selected.
  const isXeonSelected = v => v.hardware === 'xeon';
  const options = {
    hardware: {
      name: 'hardware',
      title: 'Hardware Platform',
      // Labels are the upper-cased ids; only the first platform is preselected.
      items: hardwareIds.map((id, position) => ({
        id,
        label: id.toUpperCase(),
        default: position === 0
      }))
    },
    quantization: {
      name: 'quantization',
      title: 'Quantization',
      items: [{id: 'fp16', label: 'FP16', default: true}]
    },
    strategy: {
      name: 'strategy',
      title: 'Deployment Strategy',
      type: 'checkbox',
      items: [
        // TP is preselected and flagged `required`.
        {id: 'tp', label: 'TP', subtitle: 'Tensor Parallel', default: true, required: true},
        {
          id: 'dp',
          label: 'DP',
          subtitle: 'Data Parallel',
          default: false,
          disabledWhen: isXeonSelected,
          disabledReason: xeonTpOnlyReason
        },
        {
          id: 'ep',
          label: 'EP',
          subtitle: 'Expert Parallel',
          default: false,
          disabledWhen: isXeonSelected,
          disabledReason: xeonTpOnlyReason
        }
      ]
    }
  };
  // Builds the multi-line `sglang serve` command for the current selection.
  // `values.hardware` selects platform-specific flags; `values.strategy` is
  // the list of checked strategy ids (anything that is not an array counts as
  // an empty list). Returns one flag per line, each line continued with a
  // trailing backslash.
  const generateCommand = values => {
    const {hardware, strategy} = values;
    const selected = Array.isArray(strategy) ? strategy : [];
    const amdPlatforms = ['mi300x', 'mi325x', 'mi355x'];
    // Flags are emitted in this fixed order regardless of the order in `strategy`.
    const strategyFlags = [
      ['tp', ['--tp 1']],
      ['dp', ['--dp 1', '--enable-dp-attention']],
      ['ep', ['--ep 1']]
    ];
    const args = ['sglang serve', '--model-path deepseek-ai/DeepSeek-OCR-2'];
    if (hardware === 'xeon') {
      args.push('--device cpu', '--disable-overlap-schedule', '--trust-remote-code');
    }
    args.push('--enable-multimodal');
    for (const [id, flags] of strategyFlags) {
      if (selected.includes(id)) {
        args.push(...flags);
      }
    }
    if (amdPlatforms.includes(hardware)) {
      args.push('--attention-backend triton', '--trust-remote-code');
    }
    args.push('--host 0.0.0.0', '--port 30000');
    // Space + backslash + newline + two-space indent between consecutive flags.
    return args.join(' \\\n  ');
  };
  // Computes the initial selection for every entry in `options`:
  //   - checkbox options -> array of ids whose item has a truthy `default`
  //   - text options     -> `option.default`, or '' when that is falsy
  //   - other options    -> id of the item flagged `default`, else the first
  //                         item's id, else ''
  // Options exposing `getDynamicItems` pick their default from the items that
  // function returns when given the defaults of the statically defined options.
  const getInitialState = () => {
    const checkedIds = option => (option.items || []).filter(item => item.default).map(item => item.id);
    // Defaults based on static `items` only; single-choice options without
    // static items are left out of the result.
    const collectStaticDefaults = () => {
      const defaults = {};
      for (const [name, option] of Object.entries(options)) {
        if (option.type === 'checkbox') {
          defaults[name] = checkedIds(option);
        } else if (option.type === 'text') {
          defaults[name] = option.default || '';
        } else if (option.items && option.items.length > 0) {
          const flagged = option.items.find(item => item.default);
          defaults[name] = flagged ? flagged.id : option.items[0].id;
        }
      }
      return defaults;
    };
    const singleChoiceDefault = option => {
      const candidates = option.getDynamicItems ? option.getDynamicItems(collectStaticDefaults()) : option.items || [];
      const flagged = candidates && candidates.find(item => item.default);
      if (flagged) {
        return flagged.id;
      }
      return candidates && candidates[0] ? candidates[0].id : '';
    };
    const state = {};
    for (const [name, option] of Object.entries(options)) {
      if (option.type === 'checkbox') {
        state[name] = checkedIds(option);
      } else if (option.type === 'text') {
        state[name] = option.default || '';
      } else {
        state[name] = singleChoiceDefault(option);
      }
    }
    return state;
  };
  // Current selections, initialised from getInitialState.
  const [values, setValues] = useState(getInitialState);
  // Mirrors the page theme so the inline styles can switch colours.
  const [isDark, setIsDark] = useState(false);
  useEffect(() => {
    // Dark mode is detected from a `dark` class, data-theme="dark", or an
    // inline `color-scheme: dark` on the root element.
    const rootLooksDark = root => {
      if (root.classList.contains('dark')) return true;
      if (root.getAttribute('data-theme') === 'dark') return true;
      return root.style.colorScheme === 'dark';
    };
    const syncTheme = () => {
      setIsDark(rootLooksDark(document.documentElement));
    };
    syncTheme();
    // Re-check whenever one of the three watched root attributes changes.
    const themeWatcher = new MutationObserver(syncTheme);
    const watchConfig = {
      attributes: true,
      attributeFilter: ['class', 'data-theme', 'style']
    };
    themeWatcher.observe(document.documentElement, watchConfig);
    return () => themeWatcher.disconnect();
  }, []);
  // Stores `value` as the selection for the single-choice option `optionName`.
  // When the hardware changes, the checked strategies are re-validated: ids
  // that are not defined in `options.strategy.items`, or whose `disabledWhen`
  // predicate accepts the new selection, are removed.
  const handleRadioChange = (optionName, value) => {
    setValues(prev => {
      const updated = {...prev, [optionName]: value};
      if (optionName !== 'hardware') {
        return updated;
      }
      const knownStrategies = options.strategy.items || [];
      const checked = Array.isArray(updated.strategy) ? updated.strategy : [];
      updated.strategy = checked.filter(strategyId => {
        const entry = knownStrategies.find(candidate => candidate.id === strategyId);
        if (!entry) {
          return false;
        }
        // The predicate sees the new hardware together with the old strategy list.
        const blocked = typeof entry.disabledWhen === 'function' ? entry.disabledWhen(updated) : false;
        return !blocked;
      });
      return updated;
    });
  };
  // Adds `itemId` to (isChecked) or removes it from (!isChecked) the id list
  // stored under `optionName`; a missing list is treated as empty. The previous
  // state object and list are left untouched.
  const handleCheckboxChange = (optionName, itemId, isChecked) => {
    setValues(prev => {
      const selected = prev[optionName] || [];
      const updated = isChecked ? [...selected, itemId] : selected.filter(id => id !== itemId);
      return {...prev, [optionName]: updated};
    });
  };
  // Stores `value` under `optionName` in a copy of the previous state.
  const handleTextChange = (optionName, value) => {
    const withText = prev => ({...prev, [optionName]: value});
    setValues(withText);
  };
  // Command text shown in the final card; recomputed from the current selection.
  const command = generateCommand(values);
  // Theme-dependent colours, resolved once and shared by the style objects below.
  const palette = isDark ? {
    cardBorder: '#374151',
    accent: '#E85D4D',
    cardBg: '#1f2937',
    text: '#e5e7eb',
    labelBorder: '#9ca3af',
    labelBg: '#374151',
    inputBorder: '#4b5563',
    inputBg: '#111827',
    inputText: '#e5e7eb',
    commandBg: '#111827',
    commandText: '#e5e7eb'
  } : {
    cardBorder: '#e5e7eb',
    accent: '#D45D44',
    cardBg: '#fff',
    text: 'inherit',
    labelBorder: '#d1d5db',
    labelBg: '#fff',
    inputBorder: '#d1d5db',
    inputBg: '#fff',
    inputText: '#111827',
    commandBg: '#f5f5f5',
    commandText: '#374151'
  };
  // Vertical stack holding every card.
  const containerStyle = {
    maxWidth: '900px',
    margin: '0 auto',
    display: 'flex',
    flexDirection: 'column',
    gap: '4px'
  };
  // One row per option; `borderLeft` is listed after `border`, so it sets the
  // left edge's accent colour on top of the shorthand.
  const cardStyle = {
    padding: '8px 12px',
    border: `1px solid ${palette.cardBorder}`,
    borderLeft: `3px solid ${palette.accent}`,
    borderRadius: '4px',
    display: 'flex',
    alignItems: 'center',
    gap: '12px',
    background: palette.cardBg
  };
  const titleStyle = {
    fontSize: '13px',
    fontWeight: '600',
    minWidth: '140px',
    flexShrink: 0,
    color: palette.text
  };
  const itemsStyle = {
    display: 'flex',
    rowGap: '2px',
    columnGap: '6px',
    flexWrap: 'wrap',
    alignItems: 'center',
    flex: 1
  };
  // Base look of a choice; the checked/disabled styles below are spread over it.
  const labelBaseStyle = {
    padding: '4px 10px',
    border: `1px solid ${palette.labelBorder}`,
    borderRadius: '3px',
    cursor: 'pointer',
    display: 'inline-flex',
    flexDirection: 'column',
    alignItems: 'center',
    justifyContent: 'center',
    fontWeight: '500',
    fontSize: '13px',
    transition: 'all 0.2s',
    userSelect: 'none',
    minWidth: '45px',
    textAlign: 'center',
    flex: 1,
    background: palette.labelBg,
    color: palette.text
  };
  // The checked highlight is the same in both themes.
  const checkedStyle = {
    background: '#D45D44',
    color: 'white',
    borderColor: '#D45D44'
  };
  const disabledStyle = {
    cursor: 'not-allowed',
    opacity: 0.5
  };
  const subtitleStyle = {
    display: 'block',
    fontSize: '9px',
    marginTop: '1px',
    lineHeight: '1.1',
    opacity: 0.7
  };
  const textInputStyle = {
    flex: 1,
    padding: '8px 10px',
    borderRadius: '4px',
    border: `1px solid ${palette.inputBorder}`,
    background: palette.inputBg,
    color: palette.inputText,
    fontSize: '13px'
  };
  const commandDisplayStyle = {
    flex: 1,
    padding: '12px 16px',
    background: palette.commandBg,
    borderRadius: '6px',
    fontFamily: "'Menlo', 'Monaco', 'Courier New', monospace",
    fontSize: '12px',
    lineHeight: '1.5',
    color: palette.commandText,
    whiteSpace: 'pre-wrap',
    overflowX: 'auto',
    margin: 0,
    border: `1px solid ${palette.cardBorder}`
  };
  return <div style={containerStyle} className="not-prose">
      {Object.entries(options).map(([key, option]) => {
    if (option.condition && !option.condition(values)) {
      return null;
    }
    const items = option.getDynamicItems ? option.getDynamicItems(values) : option.items || [];
    return <div key={key} style={cardStyle}>
            <div style={titleStyle}>{option.title}</div>
            <div style={itemsStyle}>
              {option.type === 'text' ? <input type="text" value={values[option.name] || ''} placeholder={option.placeholder || ''} onChange={event => handleTextChange(option.name, event.target.value)} style={textInputStyle} /> : option.type === 'checkbox' ? (option.items || []).map(item => {
      const isChecked = (values[option.name] || []).includes(item.id);
      const isDisabled = item.required || typeof item.disabledWhen === 'function' && item.disabledWhen(values);
      return <label key={item.id} title={item.disabledReason || ''} style={{
        ...labelBaseStyle,
        ...isChecked ? checkedStyle : {},
        ...isDisabled ? disabledStyle : {}
      }}>
                      <input type="checkbox" checked={isChecked} disabled={isDisabled} onChange={event => handleCheckboxChange(option.name, item.id, event.target.checked)} style={{
        display: 'none'
      }} />
                      {item.label}
                      {item.subtitle && <small style={{
        ...subtitleStyle,
        color: isChecked ? 'rgba(255,255,255,0.85)' : 'inherit'
      }}>
                          {item.subtitle}
                        </small>}
                    </label>;
    }) : items.map(item => {
      const isChecked = values[option.name] === item.id;
      const isDisabled = Boolean(item.disabled);
      return <label key={item.id} title={item.disabledReason || ''} style={{
        ...labelBaseStyle,
        ...isChecked ? checkedStyle : {},
        ...isDisabled ? disabledStyle : {}
      }}>
                      <input type="radio" name={option.name} value={item.id} checked={isChecked} disabled={isDisabled} onChange={() => !isDisabled && handleRadioChange(option.name, item.id)} style={{
        display: 'none'
      }} />
                      {item.label}
                      {item.subtitle && <small style={{
        ...subtitleStyle,
        color: isChecked ? 'rgba(255,255,255,0.85)' : 'inherit'
      }}>
                          {item.subtitle}
                        </small>}
                    </label>;
    })}
            </div>
          </div>;
  })}
      <div style={cardStyle}>
        <div style={titleStyle}>Run this Command:</div>
        <pre style={commandDisplayStyle}>{command}</pre>
      </div>
    </div>;
};

## 1. Model Introduction

[DeepSeek-OCR-2](https://github.com/deepseek-ai/DeepSeek-OCR-2) is DeepSeek's next-generation OCR (Optical Character Recognition) model, building on DeepSeek-OCR with improved accuracy and broader document understanding capabilities. The model is optimized for high-accuracy text extraction from images across a wide variety of document types and formats.

**Key Features:**

* **Semantic-Aware Visual Encoding (DeepEncoder V2)**: DeepSeek-OCR-2 introduces DeepEncoder V2, which models document reading order in a more human-like, semantic-driven manner rather than relying on fixed raster scanning. This significantly improves logical reading flow in complex layouts (e.g., multi-column documents).
* **Stronger Layout and Structural Understanding**: DeepSeek-OCR-2 demonstrates improved performance on structured documents such as tables, forms, and dense multi-column pages. It reduces reading-order errors and improves overall document parsing robustness compared to the original version.
* **Improved Accuracy While Maintaining Token Efficiency**: The original DeepSeek-OCR emphasized aggressive visual token compression. OCR-2 maintains high token efficiency while delivering higher benchmark performance, particularly on document-level understanding tasks.
* **Better Generalization Across Complex Document Tasks**: DeepSeek-OCR-2 performs more consistently across multilingual documents, structured data extraction, and visually complex content, making it more suitable for real-world document intelligence scenarios beyond plain text OCR.

**Available Models:**

* **Base Model**: [deepseek-ai/DeepSeek-OCR-2](https://huggingface.co/deepseek-ai/DeepSeek-OCR-2) - Recommended for OCR tasks

**License:**
To use DeepSeek-OCR-2, you must agree to DeepSeek's Community License. See [LICENSE](https://huggingface.co/deepseek-ai/DeepSeek-OCR-2/blob/main/LICENSE.txt) for details.

For more details, please refer to the [official DeepSeek-OCR-2 repository](https://github.com/deepseek-ai/DeepSeek-OCR-2).

## 2. SGLang Installation

Please refer to the [official SGLang installation guide](../../../docs/get-started/install) for installation instructions.

For SGLang CPU installation, please refer to the [CPU version installation guide](../../../docs/hardware-platforms/cpu_server#installation).

## 3. Model Deployment

This section provides deployment configurations optimized for different hardware platforms and use cases.

### 3.1 Basic Configuration

**Interactive Command Generator**: Use the configuration selector below to automatically generate the appropriate deployment command for your hardware platform, quantization method, and deployment strategy. SGLang supports serving DeepSeek-OCR-2 on NVIDIA H200 and B200 GPUs, AMD MI300X, MI325X, and MI355X GPUs, as well as Intel Xeon CPUs.

<DeepSeekOCR2Deployment />

**Note**: DeepSeek-OCR-2 has \~3B parameters and easily fits on a single modern GPU. For low-latency serving, no model parallelism is needed. For high-throughput requirements, consider using data parallelism with the SGLang Model Gateway — see [DP, DPA and SGLang DP Router](../../../docs/advanced_features/sgl_model_gateway) for more details.

### 3.2 Configuration Tips

* **Single GPU Deployment:** DeepSeek-OCR-2 (\~3B parameters) fits on a single modern GPU — no tensor parallelism required for low-latency serving.
* **High Throughput:** For high-throughput scenarios, use data parallelism with the SGLang Model Gateway. See [DP, DPA and SGLang DP Router](../../../docs/advanced_features/sgl_model_gateway).
* **NCCL timeout:** If model loading is slow, increase the distributed initialization timeout, e.g. `--dist-timeout 3600`.
* **CPU Serving:** To configure serving on CPU, see the `Notes` in the serving engine launch section of [the SGLang CPU server document](../../../docs/hardware-platforms/cpu_server#launch-of-the-serving-engine), which explain how to set the arguments, especially the NUMA binding settings.

## 4. Model Invocation

### 4.1 Basic Usage

**OpenAI-compatible request example**

```python Example theme={null}
import requests

url = "http://localhost:30000/v1/chat/completions"

data = {
    "model": "deepseek-ai/DeepSeek-OCR-2",
    "messages": [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "<image>\n<|grounding|>Convert the document to markdown."},
                {"type": "image_url", "image_url": {"url": "https://example.com/your_image.jpg"}},
            ],
        }
    ],
    "max_tokens": 512,
}

response = requests.post(url, json=data)
print(response.text)
```

**Reference**

* [SGLang Basic Usage Guide](../../../docs/basic_usage/send_request)

### 4.2 Recommended Prompts

The following prompts are recommended by the [official model card](https://huggingface.co/deepseek-ai/DeepSeek-OCR-2#main-prompts).

**Structured document conversion** — extracts text while preserving layout:

```text Example theme={null}
<image>
<|grounding|>Convert the document to markdown.
```

**Free-form OCR** — extracts text without preserving layout:

```text Example theme={null}
<image>
Free OCR.
```

## 5. Benchmark

### 5.1 Speed Benchmark

**Test Environment:**

* Hardware: NVIDIA H200 GPU (1x)
* Model: DeepSeek-OCR-2
* Tensor Parallelism: 1
* sglang version: 0.0.0.dev1+g93fca0bbc

We use SGLang's built-in benchmarking tool to conduct performance evaluation on the [ShareGPT\_Vicuna\_unfiltered](https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered) dataset. This dataset contains real conversation data and can better reflect performance in actual use scenarios. To simulate real-world usage patterns, we configure each request with 1024 input tokens and 1024 output tokens, representing typical medium-length conversations with detailed responses. For more details on how to perform evaluation, see [Evaluating New Models with SGLang](../../../docs/developer_guide/evaluating_new_models).

#### 5.1.1 Latency-Sensitive Benchmark

* Model Deployment Command:

```shell Command theme={null}
sglang serve \
  --model-path deepseek-ai/DeepSeek-OCR-2 \
  --enable-multimodal \
  --host 0.0.0.0 \
  --port 30000
```

* Benchmark Command:

```shell Command theme={null}
python3 -m sglang.bench_serving \
  --backend sglang \
  --host 0.0.0.0 \
  --port 30000 \
  --model deepseek-ai/DeepSeek-OCR-2 \
  --random-input-len 1024 \
  --random-output-len 1024 \
  --num-prompts 10 \
  --max-concurrency 1
```

* **Test Results:**

```text Output theme={null}
============ Serving Benchmark Result ============
Backend:                                 sglang
Traffic request rate:                    inf
Max request concurrency:                 1
Successful requests:                     10
Benchmark duration (s):                  3.54
Total input tokens:                      1972
Total input text tokens:                 1972
Total generated tokens:                  2784
Total generated tokens (retokenized):    2710
Request throughput (req/s):              2.83
Input token throughput (tok/s):          557.53
Output token throughput (tok/s):         787.10
Peak output token throughput (tok/s):    818.00
Peak concurrent requests:                5
Total token throughput (tok/s):          1344.63
Concurrency:                             1.00
----------------End-to-End Latency----------------
Mean E2E Latency (ms):                   352.69
Median E2E Latency (ms):                 392.34
P90 E2E Latency (ms):                    540.64
P99 E2E Latency (ms):                    639.01
---------------Time to First Token----------------
Mean TTFT (ms):                          18.08
Median TTFT (ms):                        16.57
P99 TTFT (ms):                           25.67
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms):                          1.18
Median TPOT (ms):                        1.21
P99 TPOT (ms):                           1.22
---------------Inter-Token Latency----------------
Mean ITL (ms):                           1.21
Median ITL (ms):                         1.21
P95 ITL (ms):                            1.28
P99 ITL (ms):                            1.44
Max ITL (ms):                            4.32
==================================================
```

#### 5.1.2 Throughput-Sensitive Benchmark

* Model Deployment Command:

```shell Command theme={null}
sglang serve \
  --model-path deepseek-ai/DeepSeek-OCR-2 \
  --enable-multimodal \
  --tp 1 \
  --ep 1 \
  --dp 1 \
  --enable-dp-attention \
  --host 0.0.0.0 \
  --port 30000
```

* Benchmark Command:

```shell Command theme={null}
python3 -m sglang.bench_serving \
  --backend sglang \
  --host 0.0.0.0 \
  --port 30000 \
  --model deepseek-ai/DeepSeek-OCR-2 \
  --random-input-len 1024 \
  --random-output-len 1024 \
  --num-prompts 1000 \
  --max-concurrency 100
```

* **Test Results:**

```text Output theme={null}
============ Serving Benchmark Result ============
Backend:                                 sglang
Traffic request rate:                    inf
Max request concurrency:                 100
Successful requests:                     1000
Benchmark duration (s):                  14.79
Total input tokens:                      301698
Total input text tokens:                 301698
Total generated tokens:                  188375
Total generated tokens (retokenized):    185236
Request throughput (req/s):              67.63
Input token throughput (tok/s):          20402.54
Output token throughput (tok/s):         12738.99
Peak output token throughput (tok/s):    17508.00
Peak concurrent requests:                187
Total token throughput (tok/s):          33141.53
Concurrency:                             86.87
----------------End-to-End Latency----------------
Mean E2E Latency (ms):                   1284.50
Median E2E Latency (ms):                 866.07
P90 E2E Latency (ms):                    3027.32
P99 E2E Latency (ms):                    5490.63
---------------Time to First Token----------------
Mean TTFT (ms):                          86.08
Median TTFT (ms):                        50.09
P99 TTFT (ms):                           613.92
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms):                          7.79
Median TPOT (ms):                        6.54
P99 TPOT (ms):                           50.10
---------------Inter-Token Latency----------------
Mean ITL (ms):                           6.42
Median ITL (ms):                         4.64
P95 ITL (ms):                            23.65
P99 ITL (ms):                            39.62
Max ITL (ms):                            452.65
==================================================
```