> ## Documentation Index
> Fetch the complete documentation index at: https://docs.sglang.io/llms.txt
> Use this file to discover all available pages before exploring further.

# MiMo-V2.5

export const MiMoV25Deployment = () => {
  const options = {
    modelVariant: {
      name: "modelVariant",
      title: "Model Variant",
      items: [{
        id: "pro",
        label: "V2.5-Pro",
        default: true,
        subtitle: "1.02T / 42B"
      }, {
        id: "base",
        label: "V2.5",
        default: false,
        subtitle: "310B / 15B"
      }]
    },
    hardware: {
      name: "hardware",
      title: "Hardware Platform",
      items: [{
        id: "h200",
        label: "H200",
        default: true
      }, {
        id: "h100",
        label: "H100",
        default: false
      }, {
        id: "b200",
        label: "B200",
        default: false
      }, {
        id: "gb300",
        label: "GB300",
        default: false
      }, {
        id: "tpu-v7x",
        label: "TPU v7x",
        default: false,
        subtitle: "sgl-jax, Pro only"
      }, {
        id: "tpu-v6e",
        label: "TPU v6e",
        default: false,
        subtitle: "sgl-jax, Pro only"
      }]
    },
    eagleMtp: {
      name: "eagleMtp",
      title: "EAGLE MTP",
      items: [{
        id: "enabled",
        label: "Enabled",
        default: true,
        subtitle: "EAGLE"
      }, {
        id: "disabled",
        label: "Disabled",
        default: false
      }]
    },
    dpAttention: {
      name: "dpAttention",
      title: "DP Attention",
      items: [{
        id: "enabled",
        label: "Enabled",
        default: false,
        subtitle: "auto for V2.5"
      }, {
        id: "disabled",
        label: "Disabled",
        default: true
      }]
    },
    expertParallelism: {
      name: "expertParallelism",
      title: "Expert Parallelism",
      items: [{
        id: "enabled",
        label: "Enabled",
        default: false,
        subtitle: "Pro Hopper"
      }, {
        id: "disabled",
        label: "Disabled",
        default: true
      }]
    },
    deepep: {
      name: "deepep",
      title: "DeepEP",
      items: [{
        id: "enabled",
        label: "Enabled",
        default: false,
        subtitle: "needs deep_ep"
      }, {
        id: "disabled",
        label: "Disabled",
        default: true,
        subtitle: "default"
      }]
    },
    reasoningParser: {
      name: "reasoningParser",
      title: "Reasoning Parser",
      items: [{
        id: "enabled",
        label: "Enabled",
        default: true,
        subtitle: "mimo"
      }, {
        id: "disabled",
        label: "Disabled",
        default: false
      }]
    },
    toolcall: {
      name: "toolcall",
      title: "Tool Call Parser",
      items: [{
        id: "enabled",
        label: "Enabled",
        default: true,
        subtitle: "mimo"
      }, {
        id: "disabled",
        label: "Disabled",
        default: false
      }]
    }
  };
  const HW_VARIANT_SPEC = {
    "pro|h200": {
      slug: "XiaomiMiMo/MiMo-V2.5-Pro",
      tp: 16,
      multinode: true,
      nnodes: 2,
      blackwell: false,
      jax: false
    },
    "pro|h100": {
      slug: "XiaomiMiMo/MiMo-V2.5-Pro",
      tp: 16,
      multinode: true,
      nnodes: 2,
      blackwell: false,
      jax: false
    },
    "pro|b200": {
      slug: "XiaomiMiMo/MiMo-V2.5-Pro",
      tp: 8,
      multinode: false,
      blackwell: true,
      jax: false
    },
    "pro|gb300": {
      slug: "XiaomiMiMo/MiMo-V2.5-Pro",
      tp: 8,
      multinode: true,
      nnodes: 2,
      blackwell: true,
      jax: false
    },
    "pro|tpu-v7x": {
      slug: "XiaomiMiMo/MiMo-V2.5-Pro",
      tp: 32,
      multinode: true,
      nnodes: 4,
      blackwell: false,
      jax: true
    },
    "pro|tpu-v6e": {
      slug: "XiaomiMiMo/MiMo-V2.5-Pro",
      tp: 64,
      multinode: true,
      nnodes: 16,
      blackwell: false,
      jax: true
    },
    "base|h200": {
      slug: "XiaomiMiMo/MiMo-V2.5",
      tp: 8,
      multinode: false,
      blackwell: false,
      jax: false,
      dp: 2
    },
    "base|h100": {
      slug: "XiaomiMiMo/MiMo-V2.5",
      tp: 8,
      multinode: false,
      blackwell: false,
      jax: false,
      dp: 2
    },
    "base|b200": {
      slug: "XiaomiMiMo/MiMo-V2.5",
      tp: 4,
      multinode: false,
      blackwell: true,
      jax: false,
      dp: 1
    },
    "base|gb300": {
      slug: "XiaomiMiMo/MiMo-V2.5",
      tp: 4,
      multinode: false,
      blackwell: true,
      jax: false,
      dp: 1
    }
  };
  const multiNodeFlags = nnodes => [`  --nnodes ${nnodes}`, `  --node-rank <node-rank>`, `  --dist-init-addr <node0-ip>:20000`];
  const prependMultiNodeNote = (cmd, nnodes) => `# Multi-node (${nnodes} nodes). Run the same command on every node with:\n` + `#   <node-rank> = 0 on the head node, 1..${nnodes - 1} on the others\n` + `#   <node0-ip>  = IP of the head node (reachable from all others)\n` + `${cmd}`;
  const computeConstraints = (variant, hardware) => {
    const isPro = variant === "pro";
    const spec = HW_VARIANT_SPEC[`${variant}|${hardware}`];
    const blackwell = spec ? spec.blackwell : false;
    const jax = spec ? spec.jax : false;
    const c = {};
    if (!isPro) {
      if (spec && spec.dp > 1) {
        c.dpAttention = {
          force: "enabled",
          reason: "V2.5 checkpoint is TP=4-interleaved; DP-attention is required (--dp = tp/4)."
        };
      } else {
        c.dpAttention = {
          force: "disabled",
          reason: "Single attention group on this hardware (tp=4, dp=1)."
        };
      }
    }
    if (blackwell) {
      c.deepep = {
        force: "disabled",
        reason: "Blackwell uses flashinfer_trtllm; DeepEP is Hopper / Ampere only."
      };
    }
    if (jax) {
      c.modelVariant = {
        force: "pro",
        reason: "sgl-jax TPU runtime only supports MiMo-V2.5-Pro today."
      };
      c.eagleMtp = {
        force: "disabled",
        reason: "EAGLE MTP is not supported on the sgl-jax TPU runtime."
      };
      c.deepep = {
        force: "disabled",
        reason: "DeepEP is a CUDA-only backend; sgl-jax uses the fused Pallas MoE kernel."
      };
      c.expertParallelism = {
        force: "enabled",
        reason: "sgl-jax TPU recipes always use EP = TP."
      };
    }
    return c;
  };
  const resolveItems = (option, constraints) => {
    const c = constraints[option.name];
    if (!c) return option.items;
    return option.items.map(item => item.id !== c.force ? {
      ...item,
      disabled: true,
      disabledReason: c.reason
    } : item);
  };
  const getInitialState = () => {
    const initialState = {};
    const constraints = computeConstraints("pro", "h200");
    for (const [key, option] of Object.entries(options)) {
      const items = resolveItems(option, constraints);
      const def = items.find(i => i.default && !i.disabled) || items.find(i => !i.disabled) || items[0];
      initialState[key] = def.id;
    }
    return initialState;
  };
  const [values, setValues] = useState(getInitialState);
  const [isDark, setIsDark] = useState(false);
  useEffect(() => {
    const checkDarkMode = () => {
      const html = document.documentElement;
      const isDarkMode = html.classList.contains("dark") || html.getAttribute("data-theme") === "dark" || html.style.colorScheme === "dark";
      setIsDark(isDarkMode);
    };
    checkDarkMode();
    const observer = new MutationObserver(checkDarkMode);
    observer.observe(document.documentElement, {
      attributes: true,
      attributeFilter: ["class", "data-theme", "style"]
    });
    return () => observer.disconnect();
  }, []);
  useEffect(() => {
    const constraints = computeConstraints(values.modelVariant, values.hardware);
    let patch = null;
    for (const [key, c] of Object.entries(constraints)) {
      if (values[key] !== c.force) {
        patch = patch || ({});
        patch[key] = c.force;
      }
    }
    if (patch) setValues(prev => ({
      ...prev,
      ...patch
    }));
  }, [values.modelVariant, values.hardware]);
  const handleRadioChange = (optionName, value) => {
    setValues(prev => ({
      ...prev,
      [optionName]: value
    }));
  };
  const generateCommand = () => {
    const {modelVariant, hardware, eagleMtp, dpAttention, expertParallelism, deepep, reasoningParser, toolcall} = values;
    const specKey = `${modelVariant}|${hardware}`;
    const spec = HW_VARIANT_SPEC[specKey];
    const {slug, tp, multinode, nnodes, blackwell, jax} = spec;
    const isPro = modelVariant === "pro";
    if (jax) {
      const isV7x = hardware === "tpu-v7x";
      const useEp = expertParallelism === "enabled";
      const useDpAttn = dpAttention === "enabled";
      const dpSize = isV7x ? 4 : 8;
      const flags = [];
      flags.push(`  --model-path ${slug}`);
      flags.push("  --trust-remote-code");
      flags.push(`  --tp-size ${tp}`);
      if (useEp) flags.push(`  --ep-size ${tp}`);
      if (useDpAttn) flags.push(`  --dp-size ${dpSize}`);
      flags.push("  --moe-backend fused");
      if (!isV7x) flags.push("  --attention-backend fa");
      flags.push("  --host 0.0.0.0");
      flags.push("  --port 30000");
      flags.push("  --page-size 256");
      flags.push("  --context-length 262144");
      flags.push("  --chunked-prefill-size 4096");
      flags.push("  --max-running-requests 512");
      if (isV7x) {
        flags.push("  --dtype bfloat16");
        flags.push("  --mem-fraction-static 0.95");
        flags.push("  --swa-full-tokens-ratio 0.25");
        flags.push("  --log-level info");
      } else {
        flags.push("  --max-seq-len 4096");
        flags.push("  --max-prefill-tokens 16384");
        flags.push("  --mem-fraction-static 0.92");
        flags.push("  --swa-full-tokens-ratio 0.15");
      }
      if (reasoningParser === "enabled") flags.push("  --reasoning-parser mimo");
      if (toolcall === "enabled") flags.push("  --tool-call-parser mimo");
      flags.push(`  --nnodes ${nnodes}`);
      flags.push("  --node-rank <node-rank>");
      flags.push("  --dist-init-addr <node0-ip>:20000");
      const cmd = `JAX_COMPILATION_CACHE_DIR=/tmp/jit_cache python -m sgl_jax.launch_server \\\n${flags.join(" \\\n")}`;
      return prependMultiNodeNote(cmd, nnodes);
    }
    const useMtp = eagleMtp === "enabled";
    const useDeepep = !blackwell && deepep === "enabled";
    const useEp = isPro && !blackwell && expertParallelism === "enabled";
    const useDpAttn = dpAttention === "enabled";
    const dpSize = !isPro ? spec.dp : tp;
    const envVars = [];
    if (isPro && blackwell && multinode) {
      envVars.push("NCCL_MNNVL_ENABLE=1", "NCCL_CUMEM_ENABLE=1");
    }
    if (useMtp) envVars.push("SGLANG_ENABLE_SPEC_V2=1");
    if (useDeepep) envVars.push("SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=256");
    const flags = [];
    flags.push("  --trust-remote-code");
    flags.push(`  --model-path ${slug}`);
    flags.push(`  --tp ${tp}`);
    if (useDpAttn) {
      flags.push(`  --dp ${dpSize}`);
      flags.push("  --enable-dp-attention");
      if (!isPro) {
        flags.push("  --enable-dp-lm-head");
        flags.push("  --mm-enable-dp-encoder");
      }
    }
    if (useEp) flags.push(`  --ep ${tp}`);
    if (multinode) flags.push(...multiNodeFlags(nnodes));
    if (isPro && blackwell) {
      flags.push("  --moe-runner-backend flashinfer_trtllm");
    } else if (useDeepep) {
      flags.push("  --moe-a2a-backend deepep");
      if (!isPro) flags.push("  --deepep-mode auto");
      flags.push("  --moe-dense-tp-size 1");
    }
    if (isPro) {
      if (blackwell) {
        flags.push("  --attention-backend fa4");
        flags.push("  --mem-fraction-static 0.8");
        flags.push("  --max-running-requests 128");
        flags.push("  --chunked-prefill-size 16384");
        if (hardware === "b200") flags.push("  --swa-full-tokens-ratio 0.1");
        flags.push(`  --model-loader-extra-config '{"enable_multithread_load": "true","num_threads": 64}'`);
      } else {
        flags.push("  --mem-fraction-static 0.7");
        flags.push("  --max-running-requests 128");
        flags.push("  --chunked-prefill-size 32768");
        flags.push("  --cuda-graph-max-bs 64");
        flags.push("  --page-size 64");
        flags.push("  --swa-full-tokens-ratio 0.3");
        flags.push(`  --model-loader-extra-config '{"enable_multithread_load": "true","num_threads": 64}'`);
      }
    } else {
      flags.push("  --mem-fraction-static 0.65");
      flags.push("  --chunked-prefill-size 16384");
    }
    if (useMtp) {
      flags.push("  --speculative-algorithm EAGLE");
      flags.push("  --speculative-num-steps 3");
      flags.push("  --speculative-eagle-topk 1");
      flags.push("  --speculative-num-draft-tokens 4");
      if (!blackwell) flags.push("  --enable-multi-layer-eagle");
    }
    if (reasoningParser === "enabled") flags.push("  --reasoning-parser mimo");
    if (toolcall === "enabled") flags.push("  --tool-call-parser mimo");
    flags.push("  --host 0.0.0.0");
    flags.push("  --port 30000");
    const envInline = envVars.length ? envVars.join(" ") + " " : "";
    const base = `${envInline}sglang serve \\\n${flags.join(" \\\n")}`;
    return multinode ? prependMultiNodeNote(base, nnodes) : base;
  };
  const containerStyle = {
    maxWidth: "900px",
    margin: "0 auto",
    display: "flex",
    flexDirection: "column",
    gap: "4px"
  };
  const cardStyle = {
    padding: "8px 12px",
    border: `1px solid ${isDark ? "#374151" : "#e5e7eb"}`,
    borderLeft: `3px solid ${isDark ? "#E85D4D" : "#D45D44"}`,
    borderRadius: "4px",
    display: "flex",
    alignItems: "center",
    gap: "12px",
    background: isDark ? "#1f2937" : "#fff"
  };
  const titleStyle = {
    fontSize: "13px",
    fontWeight: "600",
    minWidth: "140px",
    flexShrink: 0,
    color: isDark ? "#e5e7eb" : "inherit"
  };
  const itemsStyle = {
    display: "flex",
    rowGap: "2px",
    columnGap: "6px",
    flexWrap: "wrap",
    alignItems: "center",
    flex: 1
  };
  const labelBaseStyle = {
    padding: "4px 10px",
    border: `1px solid ${isDark ? "#9ca3af" : "#d1d5db"}`,
    borderRadius: "3px",
    cursor: "pointer",
    display: "inline-flex",
    flexDirection: "column",
    alignItems: "center",
    justifyContent: "center",
    fontWeight: "500",
    fontSize: "13px",
    transition: "all 0.2s",
    userSelect: "none",
    minWidth: "45px",
    textAlign: "center",
    flex: 1,
    background: isDark ? "#374151" : "#fff",
    color: isDark ? "#e5e7eb" : "inherit"
  };
  const checkedStyle = {
    background: "#D45D44",
    color: "white",
    borderColor: "#D45D44"
  };
  const disabledStyle = {
    cursor: "not-allowed",
    opacity: 0.4
  };
  const subtitleStyle = {
    display: "block",
    fontSize: "9px",
    marginTop: "1px",
    lineHeight: "1.1",
    opacity: 0.7
  };
  const commandDisplayStyle = {
    flex: 1,
    padding: "12px 16px",
    background: isDark ? "#111827" : "#f5f5f5",
    borderRadius: "6px",
    fontFamily: "'Menlo', 'Monaco', 'Courier New', monospace",
    fontSize: "12px",
    lineHeight: "1.5",
    color: isDark ? "#e5e7eb" : "#374151",
    whiteSpace: "pre-wrap",
    overflowX: "auto",
    margin: 0,
    border: `1px solid ${isDark ? "#374151" : "#e5e7eb"}`
  };
  const constraints = computeConstraints(values.modelVariant, values.hardware);
  return <div style={containerStyle} className="not-prose">
      {Object.entries(options).map(([key, option]) => {
    const items = resolveItems(option, constraints);
    return <div key={key} style={cardStyle}>
            <div style={titleStyle}>{option.title}</div>
            <div style={itemsStyle}>
              {items.map(item => {
      const isChecked = values[option.name] === item.id;
      const isDisabled = !!item.disabled;
      return <label key={item.id} style={{
        ...labelBaseStyle,
        ...isChecked ? checkedStyle : {},
        ...isDisabled ? disabledStyle : {}
      }} title={item.disabledReason || ""}>
                    <input type="radio" name={option.name} value={item.id} checked={isChecked} disabled={isDisabled} onChange={() => !isDisabled && handleRadioChange(option.name, item.id)} style={{
        display: "none"
      }} />
                    {item.label}
                    {item.subtitle && <small style={{
        ...subtitleStyle,
        color: isChecked ? "rgba(255,255,255,0.85)" : "inherit"
      }}>
                        {item.subtitle}
                      </small>}
                  </label>;
    })}
            </div>
          </div>;
  })}
      <div style={cardStyle}>
        <div style={titleStyle}>Run this Command:</div>
        <pre style={commandDisplayStyle}>{generateCommand()}</pre>
      </div>
    </div>;
};

## 1. Model Introduction

[MiMo-V2.5-Pro](https://huggingface.co/XiaomiMiMo/MiMo-V2.5-Pro) and [MiMo-V2.5](https://huggingface.co/XiaomiMiMo/MiMo-V2.5) are next-generation Mixture-of-Experts models from the XiaomiMiMo Team.

<table style={{width: "100%", borderCollapse: "collapse", tableLayout: "fixed"}}>
  <colgroup>
    <col style={{width: "25%"}} />

    <col style={{width: "15%"}} />

    <col style={{width: "15%"}} />

    <col style={{width: "45%"}} />
  </colgroup>

  <thead>
    <tr style={{borderBottom: "2px solid #d55816"}}>
      <th style={{textAlign: "left", padding: "10px 12px", fontWeight: 700, backgroundColor: "rgba(255,255,255,0.02)"}}>Variant</th>
      <th style={{textAlign: "right", padding: "10px 12px", fontWeight: 700, backgroundColor: "rgba(255,255,255,0.05)"}}>Total params</th>
      <th style={{textAlign: "right", padding: "10px 12px", fontWeight: 700, backgroundColor: "rgba(255,255,255,0.02)"}}>Active (MoE)</th>
      <th style={{textAlign: "left", padding: "10px 12px", fontWeight: 700, backgroundColor: "rgba(255,255,255,0.05)"}}>Modalities</th>
    </tr>
  </thead>

  <tbody>
    <tr>
      <td style={{padding: "9px 12px", fontWeight: 500, backgroundColor: "rgba(255,255,255,0.02)"}}><strong><a href="https://huggingface.co/XiaomiMiMo/MiMo-V2.5-Pro">MiMo-V2.5-Pro</a></strong></td>
      <td style={{padding: "9px 12px", textAlign: "right", backgroundColor: "rgba(255,255,255,0.05)"}}><strong>1.02T</strong></td>
      <td style={{padding: "9px 12px", textAlign: "right", backgroundColor: "rgba(255,255,255,0.02)"}}>42B</td>
      <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.05)"}}>Text (multimodal planned)</td>
    </tr>

    <tr>
      <td style={{padding: "9px 12px", fontWeight: 500, backgroundColor: "rgba(255,255,255,0.02)"}}><strong><a href="https://huggingface.co/XiaomiMiMo/MiMo-V2.5">MiMo-V2.5</a></strong></td>
      <td style={{padding: "9px 12px", textAlign: "right", backgroundColor: "rgba(255,255,255,0.05)"}}><strong>310B</strong></td>
      <td style={{padding: "9px 12px", textAlign: "right", backgroundColor: "rgba(255,255,255,0.02)"}}>15B</td>
      <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.05)"}}>Text, Image, Video, Audio</td>
    </tr>
  </tbody>
</table>

**Key Features:**

* **Hybrid Attention Architecture**: Interleaves Sliding Window Attention (SWA) and Global Attention (GA) for reduced KV cache while preserving long-context capability.
* **Multi-Token Prediction (MTP)**: 3-layer MTP module accelerates decoding. Both variants support EAGLE speculative decoding with MTP weights.
* **1M-Token Context**: Both variants support up to 1 million token context windows.
* **Agentic Capabilities**: Post-training with large-scale agentic RL achieves strong performance on coding, reasoning, and tool-use benchmarks.
* **MiMo-V2.5 Multimodal** (V2.5 only): Native omnimodal architecture with a 729M-param ViT Vision Encoder (28 layers: 24 SWA + 4 Full) and a 261M-param Audio Transformer (24 layers: 12 SWA + 12 Full); supports image, video, and audio understanding via standard OpenAI-compatible multimodal API.

**License:** Apache 2.0

## 2. SGLang Installation

Refer to the [official SGLang installation guide](../../../docs/get-started/install).

**Docker Images by Variant × Hardware:**

| Variant                   | Hardware                            | Docker Image                             |
| ------------------------- | ----------------------------------- | ---------------------------------------- |
| **MiMo-V2.5 (310B)**      | H100 / H200 (Hopper, CUDA 12.9)     | `lmsysorg/sglang:dev-mimo-v2.5`          |
| **MiMo-V2.5 (310B)**      | B200 / GB300 (Blackwell, CUDA 13.0) | `lmsysorg/sglang:dev-cu13-mimo-v2.5`     |
| **MiMo-V2.5-Pro (1.02T)** | H100 / H200 (Hopper, CUDA 12.9)     | `lmsysorg/sglang:dev-mimo-v2.5-pro`      |
| **MiMo-V2.5-Pro (1.02T)** | B200 / GB300 (Blackwell, CUDA 13.0) | `lmsysorg/sglang:dev-cu13-mimo-v2.5-pro` |

> Pull the image matching your GPU's CUDA driver. `lmsysorg/sglang:latest` will not load either checkpoint.

**TPU (sgl-jax):** MiMo-V2.5-Pro can also be served on TPU via the JAX-based [sgl-jax](https://github.com/sgl-project/sglang-jax) runtime. The container image and `pip install` steps are listed in [§3.3 TPU Deployment](#33-tpu-deployment-mimo-v25-pro-sgl-jax).

## 3. Model Deployment

### 3.1 Basic Configuration

Use the selector below to generate the deployment command for your variant and hardware.

<MiMoV25Deployment />

### 3.2 Configuration Tips

**MiMo-V2.5-Pro (1.02T):**

* **B200**: single node, TP=8 (verified). Uses `--attention-backend fa4` + `--moe-runner-backend flashinfer_trtllm` + `--mem-fraction-static 0.8`. Set `--swa-full-tokens-ratio 0.1` to keep KV-cache footprint within 192 GB HBM.
* **GB300**: 2 nodes, TP=8 (verified). Same Blackwell stack as B200; multi-node interconnect requires `NCCL_MNNVL_ENABLE=1 NCCL_CUMEM_ENABLE=1`. Default SWA ratio is fine.
* **H100/H200**: 2 nodes × 8 GPUs (TP=16, not yet verified). Uses the Hopper stack (`fa3` + DeepEP + EAGLE multi-layer); fits with `--mem-fraction-static 0.7` and `--swa-full-tokens-ratio 0.3`. DeepEP dispatch tuning: `SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=256` avoids memory spikes during prefill.
* EAGLE speculative decoding (3 steps, topk=1) typically yields a 2–3× decode speedup. Requires `SGLANG_ENABLE_SPEC_V2=1`; on Hopper also pass `--enable-multi-layer-eagle`.

**MiMo-V2.5 (310B):**

* The checkpoint has a TP=4-interleaved fused `qkv_proj`; attention-TP per DP group **must** be 4. Use `--dp = TP / 4`; for TP > 4 this also requires DP-attention. Total GPUs must be a multiple of 4. A bare `--tp 8` without `--dp 2` will fail to load with `MiMoV2 fused qkv_proj checkpoint is TP=4-interleaved; got attention tp_size=8`.
* Single-node deployments: H100/H200 8× GPUs (`--tp 8 --dp 2`), B200 4× GPUs (`--tp 4`, dp=1, no DP-attn flag needed), GB300 4× GPUs (`--tp 4`, single NVL4 node). FP8 quantization.
* `--enable-dp-lm-head` and `--mm-enable-dp-encoder` are required whenever `--enable-dp-attention` is on, to keep LM head and encoder sharding consistent.
* EAGLE MTP uses the checkpoint's MTP weights. For H100/H200, enable `SGLANG_ENABLE_SPEC_V2=1`, `--speculative-algorithm EAGLE`, and `--enable-multi-layer-eagle`.
* **Multimodal**: Supports image, video, and audio understanding; see Section 4.3 for invocation examples.

**DeepEP (optional toggle, Hopper-only):**

* DeepEP replaces the default MoE all-to-all dispatch with a fused [DeepEP](https://github.com/deepseek-ai/DeepEP) backend; it lowers expert dispatch latency and memory traffic, so it pays off under **high concurrency / throughput-bound workloads** on H100/H200. Under concurrency=1 / latency-bound workloads the gain is negligible — leave it off.
* Enabling adds `--moe-a2a-backend deepep` + `--moe-dense-tp-size 1` (plus `--deepep-mode auto` for V2.5) and sets the `SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=256` env var to cap the dispatch buffer. In the selector, `--ep <tp>` for Pro is added by the separate Expert Parallelism toggle. Requires `pip install deep_ep` (not part of the default sglang install).
* On Blackwell (B200, GB300) the verified MoE backend is `flashinfer_trtllm`; the selector locks the DeepEP toggle to Disabled there.

### 3.3 TPU Deployment (MiMo-V2.5-Pro, sgl-jax)

MiMo-V2.5-Pro can also be served on TPU via [sgl-jax](https://github.com/sgl-project/sglang-jax). The runtime is a separate JAX-based stack (`sgl_jax.launch_server`); pick **TPU v7x** or **TPU v6e** in the panel above to generate the launch command. Verified topologies:

| TPU Type | Topology | Chips/Node | Nodes | Total Chips | JAX Devices/Chip | Total JAX Devices (= `--tp-size`) |
| -------- | -------- | ---------- | ----- | ----------- | ---------------- | --------------------------------- |
| **v7x**  | 2×2×4    | 4          | 4     | 16          | 2                | 32                                |
| **v6e**  | 4×4×4    | 4          | 16    | 64          | 1                | 64                                |

> v7x exposes **2 logical JAX devices per chip**, so `--tp-size = 16 chips × 2 = 32`. v6e exposes 1 device per chip, so `--tp-size = 64`. Always set `--tp-size` to the total JAX device count across all nodes, not the chip count.

All nodes must sit in the same TPU slice and reach each other on the JAX init port (`20000`) and the TPU process port (`8471`).

**Step 1 — Launch the JAX TPU container on every node:**

```shell Command theme={null}
docker run -it --privileged \
  --shm-size=32g \
  --ipc=host \
  --network=host \
  -v /dev:/dev \
  us-docker.pkg.dev/cloud-tpu-images/jax-ai-image/tpu:jax0.8.1-rev1 bash
```

> The image is pinned to `jax0.8.1-rev1` to keep the JAX runtime aligned with sgl-jax's TPU extras.

**Step 2 — Clone and install sgl-jax (inside the container):**

```shell Command theme={null}
git clone https://github.com/sgl-project/sglang-jax.git
cd sglang-jax
pip install -e "python[tpu]"
```

## 4. Model Invocation

### 4.1 Basic Usage

See [Basic API Usage](../../../docs/basic_usage/send_request).

### 4.2 Reasoning Output

Both variants support hybrid thinking mode. Thinking content is separated via the reasoning parser.

**Thinking Mode (default):**

```python Example theme={null}
from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:30000/v1",
    api_key="EMPTY"
)

response = client.chat.completions.create(
    model="XiaomiMiMo/MiMo-V2.5",
    messages=[
        {"role": "user", "content": "Which is larger, 9.11 or 9.9? Think carefully."}
    ]
)

print("====== Reasoning ======")
print(response.choices[0].message.reasoning_content)
print("====== Answer ======")
print(response.choices[0].message.content)
```

**Output Example (MiMo-V2.5):**

```text theme={null}
====== Reasoning ======
Comparing 9.11 and 9.9.

The integer parts are both 9. Now compare the decimal parts: 0.11 vs 0.9.

0.9 = 0.90, which is greater than 0.11.

So 9.9 > 9.11.
====== Answer ======
**9.9 is larger than 9.11.**

Here's the reasoning: When comparing decimals, line them up to the same number of decimal places:

- 9.11
- 9.90

Both have a **9** in the ones place, but in the tenths place, **9 > 1**, so 9.90 > 0.11.

**9.9 > 9.11**
```

**Thinking Off (instant mode):**

```python Example theme={null}
response = client.chat.completions.create(
    model="XiaomiMiMo/MiMo-V2.5",
    messages=[
        {"role": "user", "content": "Which is larger, 9.11 or 9.9? Think carefully."}
    ],
    extra_body={"chat_template_kwargs": {"thinking": False}}
)

print(response.choices[0].message.content)
```

**Output Example (MiMo-V2.5):**

```text theme={null}
## Comparing 9.11 and 9.9

**9.9 is larger.**

The key is to compare them place by place. It helps to write them with the same number of decimal places:

- **9.11** → 9.11
- **9.9** → 9.90

Both have **9** in the ones place, but in the tenths place: **9** (in 9.90) is greater than **1** (in 9.11).

So **9.90 > 9.11**.
```

### 4.3 Multimodal Invocation (V2.5 only)

**Image Understanding:**

```python Example theme={null}
from openai import OpenAI

client = OpenAI(base_url="http://localhost:30000/v1", api_key="EMPTY")

response = client.chat.completions.create(
    model="XiaomiMiMo/MiMo-V2.5",
    messages=[{
        "role": "user",
        "content": [
            {"type": "image_url", "image_url": {"url": "https://raw.githubusercontent.com/sgl-project/sgl-test-files/refs/heads/main/images/man_ironing_on_back_of_suv.png"}},
            {"type": "text", "text": "Describe this image in detail."}
        ]
    }]
)

print(response.choices[0].message.content)
```

**Output Example:**

```text theme={null}
Based on the image provided, here is a detailed description:

The image captures a whimsical or surreal scene set on a busy city street, likely in New York City given the iconic yellow cabs. In the center foreground, a man is sitting on a folding chair, casually crossing his legs. He is wearing a bright yellow hoodie with a graphic on the front and blue jeans. He is intently focused on ironing a white dress shirt that rests on an ironing board set up directly on the asphalt.

Behind him, a yellow SUV taxi cab is stopped or moving slowly, angled slightly away from the camera. To his left, another yellow taxi sedan is captured in motion blur, indicating it is driving past him. The background features tall city buildings with glass windows and storefronts. There are banners hanging from streetlights, and some greenery is visible in the distance. The overall impression is one of incongruity—performing a domestic chore like ironing in the middle of a chaotic urban environment.
```

**Video Understanding:**

```python Example theme={null}
response = client.chat.completions.create(
    model="XiaomiMiMo/MiMo-V2.5",
    messages=[{
        "role": "user",
        "content": [
            {"type": "video_url", "video_url": {"url": "https://videos.pexels.com/video-files/4114797/4114797-uhd_3840_2160_25fps.mp4"}},
            {"type": "text", "text": "Summarize what happens in this video."}
        ]
    }]
)

print(response.choices[0].message.content)
```

**Output Example:**

```text theme={null}
A person wearing blue protective gloves is shown operating a microscope in a close-up shot. The individual is adjusting a knob on the side of the microscope, which moves the stage holding a glass slide, likely focusing the lens on the specimen.
```

> Video decoding requires `decord` (`pip install decord`); SGLang's MiMo-V2.5 multimodal processor uses `decord.VideoReader` for frame extraction.

**Audio Understanding:**

```python Example theme={null}
response = client.chat.completions.create(
    model="XiaomiMiMo/MiMo-V2.5",
    messages=[{
        "role": "user",
        "content": [
            {"type": "audio_url", "audio_url": {"url": "https://raw.githubusercontent.com/sgl-project/sgl-test-files/refs/heads/main/audios/Trump_WEF_2018_10s.mp3"}},
            {"type": "text", "text": "Transcribe and summarize this audio."}
        ]
    }]
)

print(response.choices[0].message.content)
```

**Output Example:**

```text theme={null}
**Transcript:**
"Thank you Klaus very much. It's a privilege to be here at this forum where leaders in business, science, art, diplomacy and world affairs have gathered for..."

**Summary:**
The speaker thanks Klaus for the introduction and expresses their honor at attending a forum. They highlight that the event has brought together high-level leaders from various sectors, including business, science, art, and diplomacy.
```

### 4.4 Tool Calling

```python Example theme={null}
from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:30000/v1",
    api_key="EMPTY"
)

tools = [
    {
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Get the current weather for a location",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {"type": "string", "description": "City name"},
                    "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]}
                },
                "required": ["location"]
            }
        }
    }
]

response = client.chat.completions.create(
    model="XiaomiMiMo/MiMo-V2.5",
    messages=[{"role": "user", "content": "What's the weather in Beijing?"}],
    tools=tools
)

msg = response.choices[0].message
if msg.reasoning_content:
    print("=== Reasoning ===")
    print(msg.reasoning_content)
if msg.tool_calls:
    print("=== Tool Calls ===")
    for tc in msg.tool_calls:
        print(f"  Function: {tc.function.name}")
        print(f"  Arguments: {tc.function.arguments}")
```

**Output Example (MiMo-V2.5):**

```text theme={null}
=== Reasoning ===
The user wants to know the weather in Beijing. I have a function available called "get_weather" that can retrieve current weather for a location. Let me call that function with Beijing as the location.
=== Tool Calls ===
  Function: get_weather
  Arguments: {"location": "Beijing"}
```

## 5. Benchmark

Accuracy numbers come from `sglang.test.run_eval` (GSM8K, standard 5-shot) and `benchmark/mmmu/bench_sglang.py` (MMMU validation split). Speed numbers come from `sglang.bench_serving` with generated random prompts: text runs set `--random-input-len 1024` and `--random-output-len 1024`, and the image run attaches 2 random 720p images to each request.

### 5.1 Accuracy Benchmark

#### 5.1.1 GSM8K

Standard 5-shot with `temperature=0` and `max_tokens=4096`. The model defaults to thinking-on, so responses contain `<think>...</think>` and the eval extracts the trailing number via regex. For the server launch command, see [Section 3](#3-model-deployment).

**Benchmark Command:**

```shell Command theme={null}
python3 -m sglang.test.run_eval \
  --base-url http://127.0.0.1:30000 \
  --model XiaomiMiMo/MiMo-V2.5 \
  --eval-name gsm8k \
  --num-examples 200 \
  --num-threads 8 \
  --max-tokens 4096 \
  --temperature 0.0
```

> `run_eval.py` automatically appends `/v1` to `--base-url`, so pass the base URL without a trailing `/v1` (for example `http://127.0.0.1:30000`); otherwise requests resolve to `/v1/v1/chat/completions` and return 404.

* **Test Results:**
  * MiMo-V2.5-Pro (FP8)
    ```
    Pending update
    ```
  * MiMo-V2.5 (FP8, 8× H200)
    ```
    Score:             0.980  (196 / 200)
    Latency:           477.52 s
    Output throughput: 88.9 tok/s
    ```

#### 5.1.2 MMMU (V2.5 only)

`MMMU/MMMU` validation split (multi-discipline multimodal), `concurrency=16`, default sampling.

* **Benchmark Command:**

```shell Command theme={null}
python3 benchmark/mmmu/bench_sglang.py \
  --port 30000 \
  --model XiaomiMiMo/MiMo-V2.5 \
  --concurrency 16
```

* **Test Results:**
  * MiMo-V2.5 (FP8)
    ```
    Pending update
    ```

### 5.2 Speed Benchmark — MiMo-V2.5-Pro

**Test Environment:**

* Hardware: NVIDIA B200 GPU (8×)
* Model: `XiaomiMiMo/MiMo-V2.5-Pro` (FP8)
* Tensor Parallelism: 8
* Recipe: Balanced (DP-attn + DeepEP + EAGLE MTP)
* sglang version: Pending update

#### 5.2.1 Latency-Sensitive Benchmark

* **Model Deployment Command:** see the [command panel above](#3-model-deployment).
* **Benchmark Command:**

```shell Command theme={null}
python3 -m sglang.bench_serving \
  --backend sglang \
  --host 127.0.0.1 \
  --port 30000 \
  --model XiaomiMiMo/MiMo-V2.5-Pro \
  --random-input-len 1024 \
  --random-output-len 1024 \
  --num-prompts 10 \
  --max-concurrency 1
```

* **Test Results:**

```text Output theme={null}
Pending update — replace with real bench_serving output after the latency run.
```

#### 5.2.2 Throughput-Sensitive Benchmark

* **Model Deployment Command:** see the [command panel above](#3-model-deployment).
* **Benchmark Command:**

```shell Command theme={null}
python3 -m sglang.bench_serving \
  --backend sglang \
  --host 127.0.0.1 \
  --port 30000 \
  --model XiaomiMiMo/MiMo-V2.5-Pro \
  --random-input-len 1024 \
  --random-output-len 1024 \
  --num-prompts 1000 \
  --max-concurrency 100
```

* **Test Results:**

```text Output theme={null}
Pending update — replace with real bench_serving output after the throughput run.
```

### 5.3 Speed Benchmark — MiMo-V2.5

**Test Environment:**

* Hardware: NVIDIA H200 GPU (8×)
* Model: `XiaomiMiMo/MiMo-V2.5` (FP8)
* Tensor Parallelism: 8 (DP-attention with `--dp 2`)
* Recipe: Balanced (DP-attn + EAGLE MTP)
* sglang version: `0.0.0.dev1+g7d99af439` (`lmsysorg/sglang:dev-mimo-v2.5`)

#### 5.3.1 Latency-Sensitive Benchmark

* **Model Deployment Command:** select MiMo-V2.5, H200, and EAGLE MTP in the [command panel above](#3-model-deployment).
* **Benchmark Command:**

```shell Command theme={null}
python3 -m sglang.bench_serving \
  --backend sglang \
  --host 127.0.0.1 \
  --port 30000 \
  --model XiaomiMiMo/MiMo-V2.5 \
  --random-input-len 1024 \
  --random-output-len 1024 \
  --num-prompts 10 \
  --max-concurrency 1
```

* **Test Results:**

```text Output theme={null}
============ Serving Benchmark Result ============
Backend:                                 sglang
Traffic request rate:                    inf
Max request concurrency:                 1
Successful requests:                     10
Benchmark duration (s):                  14.72
Total input tokens:                      1997
Total input text tokens:                 1997
Total generated tokens:                  2798
Total generated tokens (retokenized):    2697
Request throughput (req/s):              0.68
Input token throughput (tok/s):          135.67
Output token throughput (tok/s):         190.09
Peak output token throughput (tok/s):    245.00
Peak concurrent requests:                3
Total token throughput (tok/s):          325.77
Concurrency:                             1.00
Accept length:                           3.08
----------------End-to-End Latency----------------
Mean E2E Latency (ms):                   1469.98
Median E2E Latency (ms):                 1652.84
P90 E2E Latency (ms):                    2210.80
P99 E2E Latency (ms):                    2823.86
---------------Time to First Token----------------
Mean TTFT (ms):                          143.89
Median TTFT (ms):                        99.25
P99 TTFT (ms):                           481.01
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms):                          4.87
Median TPOT (ms):                        4.30
P99 TPOT (ms):                           6.64
---------------Inter-Token Latency----------------
Mean ITL (ms):                           4.76
Median ITL (ms):                         3.46
P95 ITL (ms):                            13.52
P99 ITL (ms):                            13.84
Max ITL (ms):                            74.37
==================================================
```

#### 5.3.2 Throughput-Sensitive Benchmark

* **Model Deployment Command:** select MiMo-V2.5, H200, and EAGLE MTP in the [command panel above](#3-model-deployment).
* **Benchmark Command:**

```shell Command theme={null}
python3 -m sglang.bench_serving \
  --backend sglang \
  --host 127.0.0.1 \
  --port 30000 \
  --model XiaomiMiMo/MiMo-V2.5 \
  --random-input-len 1024 \
  --random-output-len 1024 \
  --num-prompts 1000 \
  --max-concurrency 100
```

* **Test Results:**

```text Output theme={null}
============ Serving Benchmark Result ============
Backend:                                 sglang
Traffic request rate:                    inf
Max request concurrency:                 100
Successful requests:                     1000
Benchmark duration (s):                  93.41
Total input tokens:                      302118
Total input text tokens:                 302118
Total generated tokens:                  195775
Total generated tokens (retokenized):    188139
Request throughput (req/s):              10.71
Input token throughput (tok/s):          3234.48
Output token throughput (tok/s):         2095.97
Peak output token throughput (tok/s):    3019.00
Peak concurrent requests:                121
Total token throughput (tok/s):          5330.45
Concurrency:                             91.04
Accept length:                           2.95
----------------End-to-End Latency----------------
Mean E2E Latency (ms):                   8503.45
Median E2E Latency (ms):                 7491.96
P90 E2E Latency (ms):                    13706.99
P99 E2E Latency (ms):                    20474.33
---------------Time to First Token----------------
Mean TTFT (ms):                          4399.20
Median TTFT (ms):                        4333.35
P99 TTFT (ms):                           8004.81
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms):                          58.23
Median TPOT (ms):                        21.78
P99 TPOT (ms):                           747.79
---------------Inter-Token Latency----------------
Mean ITL (ms):                           20.06
Median ITL (ms):                         15.28
P95 ITL (ms):                            48.36
P99 ITL (ms):                            96.99
Max ITL (ms):                            969.61
==================================================
```

#### 5.3.3 Multimodal (Image) Benchmark

* **Model Deployment Command:** select MiMo-V2.5, H200, and EAGLE MTP in the [command panel above](#3-model-deployment).
* **Benchmark Command:**

```shell Command theme={null}
python3 -m sglang.bench_serving \
  --backend sglang-oai-chat \
  --host 127.0.0.1 \
  --port 30000 \
  --model XiaomiMiMo/MiMo-V2.5 \
  --dataset-name image \
  --image-count 2 \
  --image-resolution 720p \
  --random-input-len 128 \
  --random-output-len 1024 \
  --num-prompts 10 \
  --max-concurrency 1
```

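* **Test Results:** (in this run the TTFT, ITL, and retokenized-token fields are reported as 0 and the peak output throughput as 1.00; read those fields as not measured rather than as real values)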

```text Output theme={null}
============ Serving Benchmark Result ============
Backend:                                 sglang-oai-chat
Traffic request rate:                    inf
Max request concurrency:                 1
Successful requests:                     10
Benchmark duration (s):                  25.73
Total input tokens:                      661
Total input text tokens:                 631
Total input vision tokens:               30
Total generated tokens:                  4220
Total generated tokens (retokenized):    0
Request throughput (req/s):              0.39
Input token throughput (tok/s):          25.69
Output token throughput (tok/s):         164.03
Peak output token throughput (tok/s):    1.00
Peak concurrent requests:                2
Total token throughput (tok/s):          189.73
Concurrency:                             1.00
Accept length:                           2.94
----------------End-to-End Latency----------------
Mean E2E Latency (ms):                   2570.74
Median E2E Latency (ms):                 2411.92
P90 E2E Latency (ms):                    3711.62
P99 E2E Latency (ms):                    4949.74
---------------Time to First Token----------------
Mean TTFT (ms):                          0.00
Median TTFT (ms):                        0.00
P99 TTFT (ms):                           0.00
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms):                          7.31
Median TPOT (ms):                        6.17
P99 TPOT (ms):                           17.18
---------------Inter-Token Latency----------------
Mean ITL (ms):                           0.00
Median ITL (ms):                         0.00
P95 ITL (ms):                            0.00
P99 ITL (ms):                            0.00
Max ITL (ms):                            0.00
==================================================
```
