> ## Documentation Index
> Fetch the complete documentation index at: https://docs.sglang.io/llms.txt
> Use this file to discover all available pages before exploring further.

# Kimi-Linear

export const KimiLinearDeployment = () => {
  // Declarative description of every selector card rendered by this component.
  // Each entry carries:
  //   name  - key under which the selection is stored in the `values` state
  //   title - label shown on the left of the card
  //   type  - 'checkbox' renders a multi-select group; entries without a type render as a radio group
  //   items - selectable choices ({id, label, default}); checkbox items may also set `required`
  const options = {
    hardware: {
      name: 'hardware',
      title: 'Hardware Platform',
      items: [{
        id: 'mi300x',
        label: 'MI300x',
        // Flagged explicitly so the initial selection does not depend on the
        // "fall back to the first item" rule; the resulting state is unchanged.
        default: true
      }, {
        id: 'mi325x',
        label: 'MI325x',
        default: false
      }, {
        id: 'mi355x',
        label: 'MI355x',
        default: false
      }]
    },
    modelname: {
      name: 'modelname',
      title: 'Model Name',
      items: [{
        id: 'instruct',
        label: 'Kimi-Linear-48B-A3B-Instruct',
        default: true
      }]
    },
    strategy: {
      name: 'strategy',
      title: 'Deployment Strategy',
      type: 'checkbox',
      items: [{
        id: 'tp',
        label: 'TP',
        default: true,
        // `required` items are rendered disabled, so TP cannot be unchecked.
        required: true
      }]
    },
    reasoning: {
      name: 'reasoning',
      title: 'Reasoning Parser',
      items: [{
        id: 'disabled',
        label: 'Disabled',
        default: true
      }, {
        id: 'enabled',
        label: 'Enabled',
        default: false
      }]
    },
    toolcall: {
      name: 'toolcall',
      title: 'Tool Call Parser',
      items: [{
        id: 'disabled',
        label: 'Disabled',
        default: true
      }, {
        id: 'enabled',
        label: 'Enabled',
        default: false
      }]
    }
  };
  /**
   * Build the text shown in the "Run this Command" card.
   *
   * @param {Object} values - Current selector state.
   * @param {string} values.hardware - Selected hardware id (e.g. 'mi300x').
   * @param {string} values.modelname - Selected model id; must be a key of the model table below.
   * @param {string} values.reasoning - 'enabled' or 'disabled'.
   * @param {string} values.toolcall - 'enabled' or 'disabled'.
   * @returns {string} A multi-line `sglang.launch_server` command, or a
   *   `# Error: ...` comment block when the selection cannot be turned into a command.
   */
  const generateCommand = values => {
    const {hardware, modelname, reasoning, toolcall} = values;
    if (modelname === 'instruct' && reasoning === 'enabled') {
      return `# Error: Kimi-Linear doesn't support reasoning parser\n# Please select "Disabled" for Reasoning Parser or choose Kimi-Linear-Thinking model`;
    }
    const modelMap = {
      'instruct': 'moonshotai/Kimi-Linear-48B-A3B-Instruct'
    };
    const modelName = modelMap[modelname];
    // An id missing from the table (or an inherited key such as "constructor")
    // would otherwise be emitted as `--model-path undefined`.
    if (typeof modelName !== 'string') {
      return `# Error: Unknown model "${modelname}"\n# Please select one of the listed models`;
    }
    // Hardware ids whose command is prefixed with the SGLANG_ROCM_FUSED_DECODE_MLA=0 override.
    const overrideHardware = ['mi300x', 'mi325x', 'mi355x'];
    const envPrefix = overrideHardware.includes(hardware) ? 'SGLANG_ROCM_FUSED_DECODE_MLA=0 ' : '';
    const lines = [
      `${envPrefix}python3 -m sglang.launch_server`,
      `  --model-path ${modelName}`,
      '  --tp 4',
      '  --trust-remote-code'
    ];
    if (toolcall === 'enabled') {
      lines.push('  --tool-call-parser kimi_k2');
    }
    // Only reachable for models other than 'instruct', which is rejected above.
    if (reasoning === 'enabled') {
      lines.push('  --reasoning-parser kimi_k2');
    }
    // Every line but the last ends with a shell line continuation.
    return lines.join(' \\\n');
  };
  /**
   * Derive the initial selector state from `options`.
   *
   * Checkbox options start as the list of ids flagged `default`; text options
   * start as their `default` string (or ''); every other option starts as the
   * id of its default item, falling back to the first item, then to ''.
   *
   * @returns {Object} Map from option key to its initial value.
   */
  const getInitialState = () => {
    // Ids of every item flagged as default in a checkbox group.
    const flaggedIds = option => (option.items || []).filter(entry => entry.default).map(entry => entry.id);
    // Static defaults of all options, handed to `getDynamicItems`; options that
    // have neither a known type nor any static items are left out.
    const collectStaticDefaults = () => {
      const snapshot = {};
      for (const [name, candidate] of Object.entries(options)) {
        if (candidate.type === 'checkbox') {
          snapshot[name] = flaggedIds(candidate);
        } else if (candidate.type === 'text') {
          snapshot[name] = candidate.default || '';
        } else if (candidate.items && candidate.items.length > 0) {
          const flagged = candidate.items.find(entry => entry.default);
          snapshot[name] = flagged ? flagged.id : candidate.items[0].id;
        }
      }
      return snapshot;
    };
    // Initial value for a single option, dispatched on its type.
    const resolveDefault = option => {
      switch (option.type) {
        case 'checkbox':
          return flaggedIds(option);
        case 'text':
          return option.default || '';
        default: {
          const choices = option.getDynamicItems ? option.getDynamicItems(collectStaticDefaults()) : option.items || [];
          const flagged = choices && choices.find(entry => entry.default);
          if (flagged) {
            return flagged.id;
          }
          return choices && choices[0] ? choices[0].id : '';
        }
      }
    };
    return Object.entries(options).reduce((state, [key, option]) => {
      state[key] = resolveDefault(option);
      return state;
    }, {});
  };
  // Current selections, seeded from the option defaults via getInitialState.
  const [values, setValues] = useState(getInitialState);
  // Whether the host page is currently showing its dark theme.
  const [isDark, setIsDark] = useState(false);
  useEffect(() => {
    const root = document.documentElement;
    // The page counts as dark when <html> carries a `dark` class,
    // data-theme="dark", or an inline color-scheme of "dark".
    const syncTheme = () => {
      const darkSignals = [
        root.classList.contains('dark'),
        root.getAttribute('data-theme') === 'dark',
        root.style.colorScheme === 'dark'
      ];
      setIsDark(darkSignals.some(Boolean));
    };
    syncTheme();
    // Re-evaluate whenever one of the attributes that can carry the theme changes.
    const themeWatcher = new MutationObserver(syncTheme);
    themeWatcher.observe(root, {
      attributes: true,
      attributeFilter: ['class', 'data-theme', 'style']
    });
    // Stop observing when the component unmounts.
    return () => themeWatcher.disconnect();
  }, []);
  // Store `value` under `optionName`, leaving every other selection untouched.
  // Shared by the radio and text handlers, which perform the same update.
  const setOptionValue = (optionName, value) => {
    setValues(prev => ({
      ...prev,
      [optionName]: value
    }));
  };
  /**
   * Select a single item in a radio group.
   * @param {string} optionName - Key of the option in the `values` state.
   * @param {string} value - Id of the newly selected item.
   */
  const handleRadioChange = (optionName, value) => {
    setOptionValue(optionName, value);
  };
  /**
   * Add or remove one item from a checkbox group.
   * @param {string} optionName - Key of the option in the `values` state.
   * @param {string} itemId - Id of the item being toggled.
   * @param {boolean} isChecked - True to add the id, false to remove it.
   */
  const handleCheckboxChange = (optionName, itemId, isChecked) => {
    setValues(prev => {
      const currentValues = prev[optionName] || [];
      if (isChecked) {
        // Checking an id that is already present must not duplicate it;
        // returning `prev` unchanged leaves the state as it was.
        if (currentValues.includes(itemId)) {
          return prev;
        }
        return {
          ...prev,
          [optionName]: [...currentValues, itemId]
        };
      }
      return {
        ...prev,
        [optionName]: currentValues.filter(id => id !== itemId)
      };
    });
  };
  /**
   * Store the current contents of a text option.
   * @param {string} optionName - Key of the option in the `values` state.
   * @param {string} value - New text value.
   */
  const handleTextChange = (optionName, value) => {
    setOptionValue(optionName, value);
  };
  // Text for the final card, recomputed from the current selections.
  const command = generateCommand(values);
  // Theme-dependent colours, resolved once so the style objects below stay declarative.
  const palette = isDark ? {
    frame: '#374151',
    accent: '#E85D4D',
    cardSurface: '#1f2937',
    text: '#e5e7eb',
    chipBorder: '#9ca3af',
    chipSurface: '#374151',
    inputBorder: '#4b5563',
    inputSurface: '#111827',
    inputText: '#e5e7eb',
    codeSurface: '#111827',
    codeText: '#e5e7eb'
  } : {
    frame: '#e5e7eb',
    accent: '#D45D44',
    cardSurface: '#fff',
    text: 'inherit',
    chipBorder: '#d1d5db',
    chipSurface: '#fff',
    inputBorder: '#d1d5db',
    inputSurface: '#fff',
    inputText: '#111827',
    codeSurface: '#f5f5f5',
    codeText: '#374151'
  };
  // Font size shared by titles, option chips and the text input.
  const bodyFontSize = '13px';
  // Outer column that stacks the cards.
  const containerStyle = {
    maxWidth: '900px',
    margin: '0 auto',
    display: 'flex',
    flexDirection: 'column',
    gap: '4px'
  };
  // One row: title on the left, choices on the right.
  const cardStyle = {
    padding: '8px 12px',
    border: `1px solid ${palette.frame}`,
    borderLeft: `3px solid ${palette.accent}`,
    borderRadius: '4px',
    display: 'flex',
    alignItems: 'center',
    gap: '12px',
    background: palette.cardSurface
  };
  const titleStyle = {
    fontSize: bodyFontSize,
    fontWeight: '600',
    minWidth: '140px',
    flexShrink: 0,
    color: palette.text
  };
  // Wrapping flex row holding the option chips.
  const itemsStyle = {
    display: 'flex',
    rowGap: '2px',
    columnGap: '6px',
    flexWrap: 'wrap',
    alignItems: 'center',
    flex: 1
  };
  // Base look of an option chip; the checked/disabled styles are merged on top.
  const labelBaseStyle = {
    padding: '4px 10px',
    border: `1px solid ${palette.chipBorder}`,
    borderRadius: '3px',
    cursor: 'pointer',
    display: 'inline-flex',
    flexDirection: 'column',
    alignItems: 'center',
    justifyContent: 'center',
    fontWeight: '500',
    fontSize: bodyFontSize,
    transition: 'all 0.2s',
    userSelect: 'none',
    minWidth: '45px',
    textAlign: 'center',
    flex: 1,
    background: palette.chipSurface,
    color: palette.text
  };
  // Selected chips use the same accent colour in both themes.
  const checkedStyle = {
    background: '#D45D44',
    color: 'white',
    borderColor: '#D45D44'
  };
  const disabledStyle = {
    cursor: 'not-allowed',
    opacity: 0.5
  };
  // Small secondary line rendered under a chip label.
  const subtitleStyle = {
    display: 'block',
    fontSize: '9px',
    marginTop: '1px',
    lineHeight: '1.1',
    opacity: 0.7
  };
  const textInputStyle = {
    flex: 1,
    padding: '8px 10px',
    borderRadius: '4px',
    border: `1px solid ${palette.inputBorder}`,
    background: palette.inputSurface,
    color: palette.inputText,
    fontSize: bodyFontSize
  };
  // Monospace block for the generated command; pre-wrap keeps its line breaks.
  const commandDisplayStyle = {
    flex: 1,
    padding: '12px 16px',
    background: palette.codeSurface,
    borderRadius: '6px',
    fontFamily: "'Menlo', 'Monaco', 'Courier New', monospace",
    fontSize: '12px',
    lineHeight: '1.5',
    color: palette.codeText,
    whiteSpace: 'pre-wrap',
    overflowX: 'auto',
    margin: 0,
    border: `1px solid ${palette.frame}`
  };
  // Render one card per entry in `options`, followed by a final card that
  // shows the generated command.
  return <div style={containerStyle} className="not-prose">
      {Object.entries(options).map(([key, option]) => {
    // An option may supply a `condition(values)` predicate; its card is
    // skipped while the predicate returns a falsy value.
    if (option.condition && !option.condition(values)) {
      return null;
    }
    // Radio groups use `getDynamicItems(values)` when the option provides it,
    // otherwise the static `items` list.
    const items = option.getDynamicItems ? option.getDynamicItems(values) : option.items || [];
    return <div key={key} style={cardStyle}>
            <div style={titleStyle}>{option.title}</div>
            <div style={itemsStyle}>
              {option.type === 'text' ? <input type="text" value={values[option.name] || ''} placeholder={option.placeholder || ''} onChange={event => handleTextChange(option.name, event.target.value)} style={textInputStyle} /> : option.type === 'checkbox' ? (option.items || []).map(item => {
      // Checkbox group: `values[option.name]` is the array of checked ids.
      // NOTE(review): this branch reads `option.items` directly and ignores
      // `getDynamicItems`, unlike the radio branch below - confirm that is intended.
      const isChecked = (values[option.name] || []).includes(item.id);
      // `required` items, and items whose `disabledWhen(values)` returns a
      // truthy value, are rendered disabled.
      const isDisabled = item.required || typeof item.disabledWhen === 'function' && item.disabledWhen(values);
      // The native input is hidden (display: none); the styled <label>
      // wrapping it is what the user sees.
      return <label key={item.id} title={item.disabledReason || ''} style={{
        ...labelBaseStyle,
        ...isChecked ? checkedStyle : {},
        ...isDisabled ? disabledStyle : {}
      }}>
                      <input type="checkbox" checked={isChecked} disabled={isDisabled} onChange={event => handleCheckboxChange(option.name, item.id, event.target.checked)} style={{
        display: 'none'
      }} />
                      {item.label}
                      {item.subtitle && <small style={{
        ...subtitleStyle,
        color: isChecked ? 'rgba(255,255,255,0.85)' : 'inherit'
      }}>
                          {item.subtitle}
                        </small>}
                    </label>;
    }) : items.map(item => {
      // Radio group: `values[option.name]` holds the single selected id.
      const isChecked = values[option.name] === item.id;
      const isDisabled = Boolean(item.disabled);
      // Same hidden-input / styled-label structure as the checkbox branch.
      return <label key={item.id} title={item.disabledReason || ''} style={{
        ...labelBaseStyle,
        ...isChecked ? checkedStyle : {},
        ...isDisabled ? disabledStyle : {}
      }}>
                      <input type="radio" name={option.name} value={item.id} checked={isChecked} disabled={isDisabled} onChange={() => !isDisabled && handleRadioChange(option.name, item.id)} style={{
        display: 'none'
      }} />
                      {item.label}
                      {item.subtitle && <small style={{
        ...subtitleStyle,
        color: isChecked ? 'rgba(255,255,255,0.85)' : 'inherit'
      }}>
                          {item.subtitle}
                        </small>}
                    </label>;
    })}
            </div>
          </div>;
  })}
      <div style={cardStyle}>
        <div style={titleStyle}>Run this Command:</div>
        <pre style={commandDisplayStyle}>{command}</pre>
      </div>
    </div>;
};

## AMD GPU Support

## 1. Model Introduction

Kimi Linear is a hybrid linear attention architecture that outperforms traditional full attention methods across various contexts, including short, long, and reinforcement learning (RL) scaling regimes. At its core is Kimi Delta Attention (KDA)—a refined version of Gated DeltaNet that introduces a more efficient gating mechanism to optimize the use of finite-state RNN memory.

This generation delivers comprehensive upgrades across the board:

* **Kimi Delta Attention (KDA)**: A linear attention mechanism that refines the gated delta rule with fine-grained gating.
* **Hybrid Architecture**: A 3:1 KDA-to-global MLA ratio reduces memory usage while maintaining or surpassing the quality of full attention.
* **Superior Performance**: Outperforms full attention in a variety of tasks, including long-context and RL-style benchmarks on 1.4T token training runs with fair comparisons.
* **High Throughput**: Achieves up to 6× faster decoding and significantly reduces time per output token (TPOT).

For more details, please refer to the [official Kimi Linear GitHub repository](https://github.com/MoonshotAI/Kimi-Linear).

## 2. SGLang Installation

SGLang offers multiple installation methods. You can choose the most suitable installation method based on your hardware platform and requirements.

Please refer to the [official SGLang installation guide](../../../docs/get-started/install) for installation instructions.

## 3. Model Deployment

This section provides a progressive guide from quick deployment to performance optimization, suitable for users at different levels.

### 3.1 Basic Configuration

**Interactive Command Generator**: Use the configuration selector below to automatically generate the appropriate deployment command for your hardware platform, model variant, deployment strategy, reasoning parser, and tool call parser.

<KimiLinearDeployment />

## 4. Model Invocation

### 4.1 Basic Usage

For basic API usage and request examples, please refer to:

* [SGLang Basic Usage Guide](../../../docs/basic_usage/send_request)
* [SGLang OpenAI Vision API Guide](../../../docs/basic_usage/openai_api_vision)

### 4.2 Advanced Usage

#### 4.2.1 Launch the docker

```shell Command theme={null}
docker pull lmsysorg/sglang:v0.5.7-rocm700-mi30x
```

```shell Command theme={null}
docker run -d -it --ipc=host --network=host --privileged \
  --cap-add=CAP_SYS_ADMIN \
  --device=/dev/kfd --device=/dev/dri --device=/dev/mem \
  --group-add video --cap-add=SYS_PTRACE \
  --security-opt seccomp=unconfined \
  -v /:/work \
  -e SHELL=/bin/bash \
  --name Kimi-linear \
  lmsysorg/sglang:v0.5.7-rocm700-mi30x \
  /bin/bash
```

#### 4.2.2 Pre-installation steps inside the docker

```shell Command theme={null}
pip install sentencepiece tiktoken
```

#### 4.2.3 Launch the server

```shell Command theme={null}
export SGLANG_ROCM_FUSED_DECODE_MLA=0

SGLANG_ROCM_FUSED_DECODE_MLA=0 python3 -m sglang.launch_server \
  --model-path moonshotai/Kimi-Linear-48B-A3B-Instruct \
  --tokenizer-path  moonshotai/Kimi-Linear-48B-A3B-Instruct \
  --tp 4 \
  --trust-remote-code
```

## 5. Benchmark

### 5.1 Speed Benchmark

Test Environment:

Hardware: AMD MI300X GPU

Model: Kimi-Linear-48B-A3B-Instruct

Tensor Parallelism: 4

sglang version: 0.5.7

* **Model Deployment**

```bash Command theme={null}
SGLANG_ROCM_FUSED_DECODE_MLA=0 python3 -m sglang.launch_server \
  --model-path moonshotai/Kimi-Linear-48B-A3B-Instruct \
  --tokenizer-path  moonshotai/Kimi-Linear-48B-A3B-Instruct \
  --tp 4 \
  --trust-remote-code
```

#### 5.1.1 Low Concurrency (Latency-Optimized)

* Benchmark Command:

```bash Command theme={null}
python3 -m sglang.bench_serving \
  --backend sglang \
  --model moonshotai/Kimi-Linear-48B-A3B-Instruct \
  --dataset-name random \
  --random-input-len 1000 \
  --random-output-len 1000 \
  --num-prompts 10 \
  --max-concurrency 1 \
  --request-rate inf
```

* Test Results:

```text Output theme={null}
============ Serving Benchmark Result ============
Backend:                                 sglang
Traffic request rate:                    inf
Max request concurrency:                 1
Successful requests:                     10
Benchmark duration (s):                  23.86
Total input tokens:                      6101
Total input text tokens:                 6101
Total input vision tokens:               0
Total generated tokens:                  4220
Total generated tokens (retokenized):    4001
Request throughput (req/s):              0.42
Input token throughput (tok/s):          255.70
Output token throughput (tok/s):         176.86
Peak output token throughput (tok/s):    190.00
Peak concurrent requests:                2
Total token throughput (tok/s):          432.56
Concurrency:                             1.00
----------------End-to-End Latency----------------
Mean E2E Latency (ms):                   2383.93
Median E2E Latency (ms):                 1911.63
---------------Time to First Token----------------
Mean TTFT (ms):                          141.33
Median TTFT (ms):                        126.27
P99 TTFT (ms):                           294.76
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms):                          5.32
Median TPOT (ms):                        5.33
P99 TPOT (ms):                           5.36
---------------Inter-Token Latency----------------
Mean ITL (ms):                           5.33
Median ITL (ms):                         5.32
P95 ITL (ms):                            5.44
P99 ITL (ms):                            5.58
Max ITL (ms):                            11.46
==================================================
```

#### 5.1.2 Medium Concurrency (Balanced)

* Benchmark Command:

```bash Command theme={null}
python3 -m sglang.bench_serving \
  --backend sglang \
  --model moonshotai/Kimi-Linear-48B-A3B-Instruct \
  --dataset-name random \
  --random-input-len 1000 \
  --random-output-len 1000 \
  --num-prompts 80 \
  --max-concurrency 16 \
  --request-rate inf
```

* Test Results:

```text Output theme={null}
============ Serving Benchmark Result ============
Backend:                                 sglang
Traffic request rate:                    inf
Max request concurrency:                 16
Successful requests:                     80
Benchmark duration (s):                  31.38
Total input tokens:                      39668
Total input text tokens:                 39668
Total input vision tokens:               0
Total generated tokens:                  40805
Total generated tokens (retokenized):    39667
Request throughput (req/s):              2.55
Input token throughput (tok/s):          1264.13
Output token throughput (tok/s):         1300.37
Peak output token throughput (tok/s):    1801.00
Peak concurrent requests:                21
Total token throughput (tok/s):          2564.50
Concurrency:                             14.13
----------------End-to-End Latency----------------
Mean E2E Latency (ms):                   5543.18
Median E2E Latency (ms):                 5755.31
---------------Time to First Token----------------
Mean TTFT (ms):                          175.25
Median TTFT (ms):                        137.87
P99 TTFT (ms):                           292.92
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms):                          10.75
Median TPOT (ms):                        10.87
P99 TPOT (ms):                           16.74
---------------Inter-Token Latency----------------
Mean ITL (ms):                           10.54
Median ITL (ms):                         7.95
P95 ITL (ms):                            13.68
P99 ITL (ms):                            116.80
Max ITL (ms):                            299.89
==================================================

```

#### 5.1.3 High Concurrency (Throughput-Optimized)

* Benchmark Command:

```bash Command theme={null}
python3 -m sglang.bench_serving \
  --backend sglang \
  --model moonshotai/Kimi-Linear-48B-A3B-Instruct \
  --dataset-name random \
  --random-input-len 1000 \
  --random-output-len 1000 \
  --num-prompts 500 \
  --max-concurrency 100 \
  --request-rate inf
```

* Test Results:

```text Output theme={null}
============ Serving Benchmark Result ============
Backend:                                 sglang
Traffic request rate:                    inf
Max request concurrency:                 100
Successful requests:                     500
Benchmark duration (s):                  79.71
Total input tokens:                      249831
Total input text tokens:                 249831
Total input vision tokens:               0
Total generated tokens:                  252662
Total generated tokens (retokenized):    228448
Request throughput (req/s):              6.27
Input token throughput (tok/s):          3134.20
Output token throughput (tok/s):         3169.72
Peak output token throughput (tok/s):    6109.00
Peak concurrent requests:                110
Total token throughput (tok/s):          6303.92
Concurrency:                             94.80
----------------End-to-End Latency----------------
Mean E2E Latency (ms):                   15113.92
Median E2E Latency (ms):                 13851.52
---------------Time to First Token----------------
Mean TTFT (ms):                          564.46
Median TTFT (ms):                        226.04
P99 TTFT (ms):                           2683.14
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms):                          29.63
Median TPOT (ms):                        31.28
P99 TPOT (ms):                           38.84
---------------Inter-Token Latency----------------
Mean ITL (ms):                           28.85
Median ITL (ms):                         16.29
P95 ITL (ms):                            123.42
P99 ITL (ms):                            157.80
Max ITL (ms):                            2481.11
==================================================
```

### 5.2 Accuracy Benchmark

#### 5.2.1 GSM8K Benchmark

* Server Command

```shell Command theme={null}
SGLANG_ROCM_FUSED_DECODE_MLA=0 python3 -m sglang.launch_server \
  --model-path moonshotai/Kimi-Linear-48B-A3B-Instruct \
  --tokenizer-path  moonshotai/Kimi-Linear-48B-A3B-Instruct \
  --tp 4 \
  --trust-remote-code
```

* Benchmark Command

```shell Command theme={null}
python3 -m sglang.test.few_shot_gsm8k --num-questions 200
```

* **Result**:

```text Output theme={null}
Accuracy: 0.705
Invalid: 0.000
Latency: 11.855 s
Output throughput: 3224.982 token/s
```
