> ## Documentation Index
> Fetch the complete documentation index at: https://docs.sglang.io/llms.txt
> Use this file to discover all available pages before exploring further.

# MOVA

export const MOVADeployment = () => {
  const options = {
    hardware: {
      name: 'hardware',
      title: 'Hardware Platform',
      items: [{
        id: 'b200',
        label: 'B200',
        default: true
      }, {
        id: 'h200',
        label: 'H200',
        default: false
      }, {
        id: 'h100',
        label: 'H100',
        default: false
      }, {
        id: 'a100',
        label: 'A100',
        default: false
      }]
    },
    resolution: {
      name: 'resolution',
      title: 'Resolution',
      items: [{
        id: '360p',
        label: '360p',
        subtitle: 'Fast inference, lower VRAM',
        default: true
      }, {
        id: '720p',
        label: '720p',
        subtitle: 'Higher resolution',
        default: false
      }]
    }
  };
  const getInitialState = () => {
    const initialState = {};
    Object.entries(options).forEach(([key, option]) => {
      const defaultItem = option.items.find(item => item.default);
      initialState[key] = defaultItem ? defaultItem.id : option.items[0].id;
    });
    return initialState;
  };
  const [values, setValues] = useState(getInitialState);
  const [isDark, setIsDark] = useState(false);
  useEffect(() => {
    const checkDarkMode = () => {
      const html = document.documentElement;
      const isDarkMode = html.classList.contains('dark') || html.getAttribute('data-theme') === 'dark' || html.style.colorScheme === 'dark';
      setIsDark(isDarkMode);
    };
    checkDarkMode();
    const observer = new MutationObserver(checkDarkMode);
    observer.observe(document.documentElement, {
      attributes: true,
      attributeFilter: ['class', 'data-theme', 'style']
    });
    return () => observer.disconnect();
  }, []);
  const handleRadioChange = (optionName, value) => {
    setValues(prev => ({
      ...prev,
      [optionName]: value
    }));
  };
  const generateCommand = () => {
    const {resolution} = values;
    const modelPath = resolution === '720p' ? 'OpenMOSS-Team/MOVA-720p' : 'OpenMOSS-Team/MOVA-360p';
    return `export SG_OUTPUT_DIR=/root/output_mova
mkdir -p "$SG_OUTPUT_DIR"

sglang serve \\
  --model-path ${modelPath} \\
  --host 0.0.0.0 \\
  --port 30002 \\
  --adjust-frames false \\
  --num-gpus 8 \\
  --ring-degree 2 \\
  --ulysses-degree 4 \\
  --tp 1 \\
  --enable-torch-compile \\
  --save-output \\
  --output-dir "$SG_OUTPUT_DIR"`;
  };
  const containerStyle = {
    maxWidth: '900px',
    margin: '0 auto',
    display: 'flex',
    flexDirection: 'column',
    gap: '4px'
  };
  const cardStyle = {
    padding: '8px 12px',
    border: `1px solid ${isDark ? '#374151' : '#e5e7eb'}`,
    borderLeft: `3px solid ${isDark ? '#E85D4D' : '#D45D44'}`,
    borderRadius: '4px',
    display: 'flex',
    alignItems: 'center',
    gap: '12px',
    background: isDark ? '#1f2937' : '#fff'
  };
  const titleStyle = {
    fontSize: '13px',
    fontWeight: '600',
    minWidth: '140px',
    flexShrink: 0,
    color: isDark ? '#e5e7eb' : 'inherit'
  };
  const itemsStyle = {
    display: 'flex',
    rowGap: '2px',
    columnGap: '6px',
    flexWrap: 'wrap',
    alignItems: 'center',
    flex: 1
  };
  const labelBaseStyle = {
    padding: '4px 10px',
    border: `1px solid ${isDark ? '#9ca3af' : '#d1d5db'}`,
    borderRadius: '3px',
    cursor: 'pointer',
    display: 'inline-flex',
    flexDirection: 'column',
    alignItems: 'center',
    justifyContent: 'center',
    fontWeight: '500',
    fontSize: '13px',
    transition: 'all 0.2s',
    userSelect: 'none',
    minWidth: '45px',
    textAlign: 'center',
    flex: 1,
    background: isDark ? '#374151' : '#fff',
    color: isDark ? '#e5e7eb' : 'inherit'
  };
  const checkedStyle = {
    background: '#D45D44',
    color: 'white',
    borderColor: '#D45D44'
  };
  const subtitleStyle = {
    display: 'block',
    fontSize: '9px',
    marginTop: '1px',
    lineHeight: '1.1',
    opacity: 0.7
  };
  const commandDisplayStyle = {
    flex: 1,
    padding: '12px 16px',
    background: isDark ? '#111827' : '#f5f5f5',
    borderRadius: '6px',
    fontFamily: "'Menlo', 'Monaco', 'Courier New', monospace",
    fontSize: '12px',
    lineHeight: '1.5',
    color: isDark ? '#e5e7eb' : '#374151',
    whiteSpace: 'pre-wrap',
    overflowX: 'auto',
    margin: 0,
    border: `1px solid ${isDark ? '#374151' : '#e5e7eb'}`
  };
  return <div style={containerStyle} className="not-prose">
      {Object.entries(options).map(([key, option]) => <div key={key} style={cardStyle}>
          <div style={titleStyle}>{option.title}</div>
          <div style={itemsStyle}>
            {option.items.map(item => {
    const isChecked = values[option.name] === item.id;
    return <label key={item.id} style={{
      ...labelBaseStyle,
      ...isChecked ? checkedStyle : {}
    }}>
                  <input type="radio" name={option.name} value={item.id} checked={isChecked} onChange={() => handleRadioChange(option.name, item.id)} style={{
      display: 'none'
    }} />
                  {item.label}
                  {item.subtitle && <small style={{
      ...subtitleStyle,
      color: isChecked ? 'rgba(255,255,255,0.85)' : 'inherit'
    }}>{item.subtitle}</small>}
                </label>;
  })}
          </div>
        </div>)}
      <div style={cardStyle}>
        <div style={titleStyle}>Run this Command:</div>
        <pre style={commandDisplayStyle}>{generateCommand()}</pre>
      </div>
    </div>;
};

## 1. Model Introduction

[MOVA](https://github.com/OpenMOSS/MOVA) (MOSS Video and Audio) is a foundation model developed by the SII-OpenMOSS Team, designed to break the "silent era" of open-source video generation. Unlike cascaded pipelines that generate sound as an afterthought, MOVA synthesizes video and audio simultaneously in a single inference pass for perfect alignment. It adopts an Asymmetric Dual-Tower Architecture, fusing pre-trained video and audio towers through a bidirectional cross-attention mechanism to maintain tight synchronization between video and audio during generation.

[MOVA-360p](https://huggingface.co/OpenMOSS-Team/MOVA-360p) is suitable for fast inference and resource-constrained environments. [MOVA-720p](https://huggingface.co/OpenMOSS-Team/MOVA-720p) provides higher resolution video generation. Both versions support generating up to 8 seconds of video-audio content.

**Key Features:**

* **Native Bimodal Generation**: Generates high-fidelity video and synchronized audio in a single inference pass, eliminating error accumulation from cascaded pipelines
* **Precise Lip-Sync**: Achieves state-of-the-art performance in multilingual lip-synchronization (LSE-D: 7.094, LSE-C: 7.452 with Dual CFG on Verse-Bench Set3)
* **Environment-Aware Sound Effects**: Generates corresponding environmental sound effects including physical interaction sounds, ambient sounds, and spatial/textural sound feedback
* **Fully Open-Source**: Model weights, inference code, training pipelines, and LoRA fine-tuning scripts are all open-sourced

For more details, please refer to the [MOVA-360p HuggingFace page](https://huggingface.co/OpenMOSS-Team/MOVA-360p), the [MOVA-720p HuggingFace page](https://huggingface.co/OpenMOSS-Team/MOVA-720p), the [GitHub repository](https://github.com/OpenMOSS/MOVA), and the [technical report (arXiv)](https://arxiv.org/abs/2602.08794).

## 2. SGLang-diffusion Installation

SGLang-diffusion offers multiple installation methods. You can choose the most suitable installation method based on your hardware platform and requirements.

Please refer to the [official SGLang-diffusion installation guide](https://github.com/sgl-project/sglang/blob/main/python/sglang/multimodal_gen/docs/install.md) for installation instructions.

## 3. Model Deployment

This section provides deployment configurations optimized for different hardware platforms and use cases.

### 3.1 Basic Configuration

MOVA supports both online serving and CLI generation modes. The recommended launch configurations vary by hardware and resolution.

**Interactive Command Generator**: Use the configuration selector below to automatically generate the appropriate deployment command for your hardware platform and target resolution.

<MOVADeployment />

### 3.2 Configuration Tips

All currently supported optimizations are listed [here](https://github.com/sgl-project/sglang/blob/main/python/sglang/multimodal_gen/docs/support_matrix.md).

* `--num-gpus`: Number of GPUs to use
* `--tp`: Tensor parallelism size (should not be larger than 1 if text encoder offload is enabled, as layer-wise offload plus prefetch is faster)
* `--ring-degree`: The degree of ring attention-style SP in USP
* `--ulysses-degree`: The degree of DeepSpeed-Ulysses-style SP in USP
* `--adjust-frames`: Whether to adjust frames automatically (set to `false` for MOVA)
* `--enable-torch-compile`: Enable torch.compile for faster inference

## 4. API Usage

For complete API documentation, please refer to the [official API usage guide](https://github.com/sgl-project/sglang/blob/main/python/sglang/multimodal_gen/docs/openai_api.md).

### 4.1 CLI Generation (sglang generate)

```bash Command theme={null}
sglang generate \
  --model-path OpenMOSS-Team/MOVA-720p \
  --prompt "A man in a blue blazer and glasses speaks in a formal indoor setting, \
  framed by wooden furniture and a filled bookshelf. \
  Quiet room acoustics underscore his measured tone as he delivers his remarks. \
  At one point, he says, \"I would also believe that this advance in AI recently wasn't unexpected.\"" \
  --image-path "<YOUR-IMAGE-PATH>" \
  --adjust-frames false \
  --num-gpus 8 \
  --ring-degree 2 \
  --ulysses-degree 4 \
  --num-frames 193 \
  --fps 24 \
  --seed 67 \
  --num-inference-steps 25 \
  --enable-torch-compile \
  --save-output
```

### 4.2 Generate a Video

```bash Command theme={null}
curl -X POST "http://0.0.0.0:30002/v1/videos" \
  -F "prompt=A man in a blue blazer and glasses speaks in a formal indoor setting, framed by wooden furniture and a filled bookshelf. Quiet room acoustics underscore his measured tone as he delivers his remarks. At one point, he says, \"I would also believe that this advance in AI recently wasn't unexpected.\"" \
  -F "input_reference=@<YOUR-IMAGE-PATH>" \
  -F "size=640x352" \
  -F "num_frames=193" \
  -F "fps=24" \
  -F "seed=67" \
  -F "guidance_scale=5.0" \
  -F "num_inference_steps=25" \
  -o create_video.json
```

### 4.3 Advanced Usage

#### 4.3.1 Cache-DiT Acceleration

SGLang integrates [Cache-DiT](https://github.com/vipshop/cache-dit), a caching acceleration engine for Diffusion Transformers (DiT), to achieve up to 7.4x inference speedup with minimal quality loss. You can set `SGLANG_CACHE_DIT_ENABLED=true` to enable it. For more details, please refer to the SGLang Cache-DiT [documentation](https://github.com/sgl-project/sglang/blob/main/python/sglang/multimodal_gen/docs/cache_dit.md).

**Basic Usage**

```bash Command theme={null}
SGLANG_CACHE_DIT_ENABLED=true sglang serve --model-path OpenMOSS-Team/MOVA-720p
```

**Advanced Usage**

* DBCache Parameters: DBCache controls block-level caching behavior:

<table style={{width: "100%", borderCollapse: "collapse", tableLayout: "fixed"}}>
  <thead>
    <tr style={{borderBottom: "2px solid #d55816"}}>
      <th style={{textAlign: "left", padding: "10px 12px", fontWeight: 700, whiteSpace: "nowrap", backgroundColor: "rgba(255,255,255,0.02)"}}>Parameter</th>
      <th style={{textAlign: "left", padding: "10px 12px", fontWeight: 700, whiteSpace: "nowrap", backgroundColor: "rgba(255,255,255,0.05)"}}>Env Variable</th>
      <th style={{textAlign: "left", padding: "10px 12px", fontWeight: 700, whiteSpace: "nowrap", backgroundColor: "rgba(255,255,255,0.02)"}}>Default</th>
      <th style={{textAlign: "left", padding: "10px 12px", fontWeight: 700, whiteSpace: "nowrap", backgroundColor: "rgba(255,255,255,0.05)"}}>Description</th>
    </tr>
  </thead>

  <tbody>
    <tr>
      <td style={{padding: "9px 12px", fontWeight: 500, backgroundColor: "rgba(255,255,255,0.02)"}}>Fn</td>
      <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.05)"}}>`SGLANG_CACHE_DIT_FN`</td>
      <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.02)"}}>1</td>
      <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.05)"}}>Number of first blocks to always compute</td>
    </tr>

    <tr>
      <td style={{padding: "9px 12px", fontWeight: 500, backgroundColor: "rgba(255,255,255,0.02)"}}>Bn</td>
      <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.05)"}}>`SGLANG_CACHE_DIT_BN`</td>
      <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.02)"}}>0</td>
      <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.05)"}}>Number of last blocks to always compute</td>
    </tr>

    <tr>
      <td style={{padding: "9px 12px", fontWeight: 500, backgroundColor: "rgba(255,255,255,0.02)"}}>W</td>
      <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.05)"}}>`SGLANG_CACHE_DIT_WARMUP`</td>
      <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.02)"}}>4</td>
      <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.05)"}}>Warmup steps before caching starts</td>
    </tr>

    <tr>
      <td style={{padding: "9px 12px", fontWeight: 500, backgroundColor: "rgba(255,255,255,0.02)"}}>R</td>
      <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.05)"}}>`SGLANG_CACHE_DIT_RDT`</td>
      <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.02)"}}>0.24</td>
      <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.05)"}}>Residual difference threshold</td>
    </tr>

    <tr>
      <td style={{padding: "9px 12px", fontWeight: 500, backgroundColor: "rgba(255,255,255,0.02)"}}>MC</td>
      <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.05)"}}>`SGLANG_CACHE_DIT_MC`</td>
      <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.02)"}}>3</td>
      <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.05)"}}>Maximum continuous cached steps</td>
    </tr>
  </tbody>
</table>

* TaylorSeer Configuration: TaylorSeer improves caching accuracy using Taylor expansion:

<table style={{width: "100%", borderCollapse: "collapse", tableLayout: "fixed"}}>
  <thead>
    <tr style={{borderBottom: "2px solid #d55816"}}>
      <th style={{textAlign: "left", padding: "10px 12px", fontWeight: 700, whiteSpace: "nowrap", backgroundColor: "rgba(255,255,255,0.02)"}}>Parameter</th>
      <th style={{textAlign: "left", padding: "10px 12px", fontWeight: 700, whiteSpace: "nowrap", backgroundColor: "rgba(255,255,255,0.05)"}}>Env Variable</th>
      <th style={{textAlign: "left", padding: "10px 12px", fontWeight: 700, whiteSpace: "nowrap", backgroundColor: "rgba(255,255,255,0.02)"}}>Default</th>
      <th style={{textAlign: "left", padding: "10px 12px", fontWeight: 700, whiteSpace: "nowrap", backgroundColor: "rgba(255,255,255,0.05)"}}>Description</th>
    </tr>
  </thead>

  <tbody>
    <tr>
      <td style={{padding: "9px 12px", fontWeight: 500, backgroundColor: "rgba(255,255,255,0.02)"}}>Enable</td>
      <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.05)"}}>`SGLANG_CACHE_DIT_TAYLORSEER`</td>
      <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.02)"}}>false</td>
      <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.05)"}}>Enable TaylorSeer calibrator</td>
    </tr>

    <tr>
      <td style={{padding: "9px 12px", fontWeight: 500, backgroundColor: "rgba(255,255,255,0.02)"}}>Order</td>
      <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.05)"}}>`SGLANG_CACHE_DIT_TS_ORDER`</td>
      <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.02)"}}>1</td>
      <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.05)"}}>Taylor expansion order (1 or 2)</td>
    </tr>
  </tbody>
</table>

Combined Configuration Example:

```bash Command theme={null}
SGLANG_CACHE_DIT_ENABLED=true \
SGLANG_CACHE_DIT_FN=2 \
SGLANG_CACHE_DIT_BN=1 \
SGLANG_CACHE_DIT_WARMUP=4 \
SGLANG_CACHE_DIT_RDT=0.4 \
SGLANG_CACHE_DIT_MC=4 \
SGLANG_CACHE_DIT_TAYLORSEER=true \
SGLANG_CACHE_DIT_TS_ORDER=2 \
sglang serve --model-path OpenMOSS-Team/MOVA-720p
```

#### 4.3.2 CPU Offload

* `--dit-cpu-offload`: Use CPU offload for DiT inference. Enable this if you run out of GPU memory.
* `--text-encoder-cpu-offload`: Use CPU offload for text encoder inference.
* `--vae-cpu-offload`: Use CPU offload for VAE.
* `--pin-cpu-memory`: Pin memory for CPU offload. Use this only as a temporary workaround if you encounter "CUDA error: invalid argument".

## 5. Benchmark

### 5.1 Speedup Benchmark

#### 5.1.1 Generate a video

Test Environment:

* Hardware: NVIDIA H200 x 8
* git revision: 443b1a8
* Model: OpenMOSS-Team/MOVA-720p

**Server Command**:

```bash Command theme={null}
sglang serve --model-path OpenMOSS-Team/MOVA-720p --port 30002 \
  --adjust-frames false --num-gpus 8 --ring-degree 2 --ulysses-degree 4 \
  --tp 1 --enable-torch-compile
```

**Benchmark Command**:

```bash Command theme={null}
python3 -m sglang.multimodal_gen.benchmarks.bench_serving \
    --task image-to-video --dataset vbench --num-prompts 1 --max-concurrency 1 \
    --port 30002
```

**Result**:

```text Output theme={null}
================= Serving Benchmark Result =================
Task:                                    image-to-video
Model:                                   OpenMOSS-Team/MOVA-720p
Dataset:                                 vbench
--------------------------------------------------
Benchmark duration (s):                  590.76
Request rate:                            inf
Max request concurrency:                 1
Successful requests:                     1/1
--------------------------------------------------
Request throughput (req/s):              0.00
Latency Mean (s):                        590.7549
Latency Median (s):                      590.7549
Latency P99 (s):                         590.7549
--------------------------------------------------
Peak Memory Max (MB):                    74996.00
Peak Memory Mean (MB):                   74996.00
Peak Memory Median (MB):                 74996.00
============================================================
```

#### 5.1.2 Generate videos with high concurrency

**Server Command**:

```bash Command theme={null}
sglang serve --model-path OpenMOSS-Team/MOVA-720p --port 30002 \
  --adjust-frames false --num-gpus 8 --ring-degree 2 --ulysses-degree 4 \
  --tp 1 --enable-torch-compile
```

**Benchmark Command**:

```bash Command theme={null}
python3 -m sglang.multimodal_gen.benchmarks.bench_serving \
    --task image-to-video --dataset vbench --num-prompts 20 --max-concurrency 20 \
    --port 30002
```
