> ## Documentation Index
> Fetch the complete documentation index at: https://docs.sglang.io/llms.txt
> Use this file to discover all available pages before exploring further.

# Nemotron 3 Nano Omni

export const Nemotron3NanoOmniDeployment = () => {
  const MODEL_PATHS = {
    reasoning: 'nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning',
    bf16: 'nvidia/Nemotron-3-Nano-Omni-30B-A3B-BF16',
    fp8: 'nvidia/Nemotron-3-Nano-Omni-30B-A3B-FP8',
    nvfp4: 'nvidia/Nemotron-3-Nano-Omni-30B-A3B-NVFP4'
  };
  const options = {
    model: {
      name: 'model',
      title: 'Model',
      items: [{
        id: 'reasoning',
        label: 'Reasoning',
        default: true
      }, {
        id: 'bf16',
        label: 'BF16',
        default: false
      }, {
        id: 'fp8',
        label: 'FP8',
        default: false
      }, {
        id: 'nvfp4',
        label: 'NVFP4',
        default: false
      }]
    },
    hardware: {
      name: 'hardware',
      title: 'Hardware Platform',
      items: [{
        id: 'h100',
        label: 'H100',
        default: true
      }, {
        id: 'h200',
        label: 'H200',
        default: false
      }, {
        id: 'b200',
        label: 'B200',
        default: false
      }, {
        id: 'a100',
        label: 'A100',
        default: false
      }, {
        id: 'l40s',
        label: 'L40S',
        default: false
      }]
    },
    tp: {
      name: 'tp',
      title: 'Tensor Parallel (TP)',
      items: [{
        id: '1',
        label: 'TP=1',
        default: false
      }, {
        id: '2',
        label: 'TP=2',
        default: false
      }, {
        id: '4',
        label: 'TP=4',
        default: true
      }, {
        id: '8',
        label: 'TP=8',
        default: false
      }]
    },
    kvcache: {
      name: 'kvcache',
      title: 'KV Cache DType',
      items: [{
        id: 'none',
        label: 'None',
        default: true
      }, {
        id: 'fp8_e4m3',
        label: 'fp8_e4m3',
        default: false
      }]
    },
    thinking: {
      name: 'thinking',
      title: 'Reasoning Parser',
      items: [{
        id: 'thinking_on',
        label: 'Enabled',
        default: true
      }, {
        id: 'thinking_off',
        label: 'Disabled',
        default: false
      }],
      commandRule: value => value === 'thinking_on' ? '--reasoning-parser deepseek-r1' : null
    },
    toolcall: {
      name: 'toolcall',
      title: 'Tool Call Parser',
      items: [{
        id: 'toolcall_on',
        label: 'Enabled',
        default: true
      }, {
        id: 'toolcall_off',
        label: 'Disabled',
        default: false
      }],
      commandRule: value => value === 'toolcall_on' ? '--tool-call-parser qwen3_coder' : null
    }
  };
  const generateCommand = values => {
    const {tp, kvcache, model, hardware} = values;
    if (model === 'nvfp4' && hardware !== 'b200') {
      return '# NVFP4 requires Blackwell hardware. Please select B200.';
    }
    if (hardware === 'l40s' && tp === '1') {
      return '# TP=1 is not supported on L40S for this model. Please use TP=2 or higher.';
    }
    const modelPath = MODEL_PATHS[model] || MODEL_PATHS.reasoning;
    let cmd = 'sglang serve \\\n';
    cmd += `  --model-path ${modelPath} \\\n`;
    cmd += '  --host 0.0.0.0 \\\n';
    cmd += '  --port 30000 \\\n';
    cmd += '  --trust-remote-code \\\n';
    cmd += `  --tp ${tp} \\\n`;
    if (kvcache && kvcache !== 'none') {
      cmd += `  --kv-cache-dtype ${kvcache} \\\n`;
    }
    for (const [key, option] of Object.entries(options)) {
      if (option.commandRule) {
        const rule = option.commandRule(values[key]);
        if (rule) {
          cmd += `  ${rule} \\\n`;
        }
      }
    }
    cmd = cmd.trimEnd();
    if (cmd.endsWith('\\')) {
      cmd = cmd.slice(0, -1).trimEnd();
    }
    return cmd;
  };
  const getInitialState = () => {
    const initialState = {};
    Object.entries(options).forEach(([key, option]) => {
      const items = option.items || [];
      const defaultItem = items.find(item => item.default);
      initialState[key] = defaultItem ? defaultItem.id : items[0]?.id || '';
    });
    return initialState;
  };
  const [values, setValues] = useState(getInitialState);
  const [isDark, setIsDark] = useState(false);
  useEffect(() => {
    const checkDarkMode = () => {
      const html = document.documentElement;
      const isDarkMode = html.classList.contains('dark') || html.getAttribute('data-theme') === 'dark' || html.style.colorScheme === 'dark';
      setIsDark(isDarkMode);
    };
    checkDarkMode();
    const observer = new MutationObserver(checkDarkMode);
    observer.observe(document.documentElement, {
      attributes: true,
      attributeFilter: ['class', 'data-theme', 'style']
    });
    return () => observer.disconnect();
  }, []);
  const handleRadioChange = (optionName, value) => {
    setValues(prev => ({
      ...prev,
      [optionName]: value
    }));
  };
  const command = generateCommand(values);
  const containerStyle = {
    maxWidth: '900px',
    margin: '0 auto',
    display: 'flex',
    flexDirection: 'column',
    gap: '4px'
  };
  const cardStyle = {
    padding: '8px 12px',
    border: `1px solid ${isDark ? '#374151' : '#e5e7eb'}`,
    borderLeft: `3px solid ${isDark ? '#E85D4D' : '#D45D44'}`,
    borderRadius: '4px',
    display: 'flex',
    alignItems: 'center',
    gap: '12px',
    background: isDark ? '#1f2937' : '#fff'
  };
  const titleStyle = {
    fontSize: '13px',
    fontWeight: '600',
    minWidth: '140px',
    flexShrink: 0,
    color: isDark ? '#e5e7eb' : 'inherit'
  };
  const itemsStyle = {
    display: 'flex',
    rowGap: '2px',
    columnGap: '6px',
    flexWrap: 'wrap',
    alignItems: 'center',
    flex: 1
  };
  const labelBaseStyle = {
    padding: '4px 10px',
    border: `1px solid ${isDark ? '#9ca3af' : '#d1d5db'}`,
    borderRadius: '3px',
    cursor: 'pointer',
    display: 'inline-flex',
    flexDirection: 'column',
    alignItems: 'center',
    justifyContent: 'center',
    fontWeight: '500',
    fontSize: '13px',
    transition: 'all 0.2s',
    userSelect: 'none',
    minWidth: '45px',
    textAlign: 'center',
    flex: 1,
    background: isDark ? '#374151' : '#fff',
    color: isDark ? '#e5e7eb' : 'inherit'
  };
  const checkedStyle = {
    background: '#D45D44',
    color: 'white',
    borderColor: '#D45D44'
  };
  const disabledStyle = {
    cursor: 'not-allowed',
    opacity: 0.5
  };
  const commandDisplayStyle = {
    flex: 1,
    padding: '12px 16px',
    background: isDark ? '#111827' : '#f5f5f5',
    borderRadius: '6px',
    fontFamily: "'Menlo', 'Monaco', 'Courier New', monospace",
    fontSize: '12px',
    lineHeight: '1.5',
    color: isDark ? '#e5e7eb' : '#374151',
    whiteSpace: 'pre-wrap',
    overflowX: 'auto',
    margin: 0,
    border: `1px solid ${isDark ? '#374151' : '#e5e7eb'}`
  };
  return <div style={containerStyle} className="not-prose">
      {Object.entries(options).map(([key, option]) => {
    const items = option.items || [];
    return <div key={key} style={cardStyle}>
            <div style={titleStyle}>{option.title}</div>
            <div style={itemsStyle}>
              {items.map(item => {
      const isChecked = values[option.name] === item.id;
      const isDisabled = Boolean(item.disabled);
      return <label key={item.id} title={item.disabledReason || ''} style={{
        ...labelBaseStyle,
        ...isChecked ? checkedStyle : {},
        ...isDisabled ? disabledStyle : {}
      }}>
                    <input type="radio" name={option.name} value={item.id} checked={isChecked} disabled={isDisabled} onChange={() => !isDisabled && handleRadioChange(option.name, item.id)} style={{
        display: 'none'
      }} />
                    {item.label}
                  </label>;
    })}
            </div>
          </div>;
  })}
      <div style={cardStyle}>
        <div style={titleStyle}>Run this Command:</div>
        <pre style={commandDisplayStyle}>{command}</pre>
      </div>
    </div>;
};

## 1. Model Introduction

`NVIDIA Nemotron 3 Nano Omni` is a 30B-parameter hybrid MoE multimodal model that activates only 3B parameters per forward pass, combining vision and audio encoders into a unified architecture. Part of the Nemotron 3 family, it is designed to power multimodal sub-agents that perceive and reason across vision, audio, and language in a single inference loop — eliminating the fragmented stacks of separate models for each modality.

Architecture and key features:

* **Hybrid Transformer-Mamba Architecture (MoE):** Combines Mixture of Experts with a hybrid Transformer-Mamba architecture for efficient routing and sequence modeling.
* **30B total / 3B active parameters:** Delivers strong multimodal accuracy at a fraction of the cost of dense models.
* **1M token context window:** Sustains coherent agent state across extended multimodal workflows — screen history, document content, and audio context remain in view without re-ingestion.
* **Unified vision and audio encoders:** One model replaces fragmented multimodal stacks; vision and audio perception happen in the same forward pass.
* **3D Convolution (Conv3D):** Efficient temporal-spatial processing for video inputs.
* **Efficient Video Sampling (EVS):** Enables longer video processing at the same compute budget via temporal-aware perception and adaptive frame sampling.
* **FP8 and NVFP4 quantization:** FP8 supports deployment from workstation (RTX 6000, DGX Spark) to cloud (H100, H200, B200, A100, L40S); NVFP4 requires Blackwell hardware.
* **9x higher throughput** than other open omni models at the same interactivity level.
* **\~20% higher multimodal intelligence** compared to the best open alternative.
* **Post-trained with multi-environment reinforcement learning** via NVIDIA NeMo RL and NeMo Gym across text, image, audio, and video environments, improving instruction following and convergence to correct multimodal answers.

**Modalities:** Input: text, image, video, audio — Output: text

**Supported GPUs:** NVIDIA B200, H100, H200, A100, L40S, DGX Spark, RTX 6000

Available model variants on HuggingFace:

* [`nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning`](https://huggingface.co/nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning)
* [`nvidia/Nemotron-3-Nano-Omni-30B-A3B-BF16`](https://huggingface.co/nvidia/Nemotron-3-Nano-Omni-30B-A3B-BF16)
* [`nvidia/Nemotron-3-Nano-Omni-30B-A3B-FP8`](https://huggingface.co/nvidia/Nemotron-3-Nano-Omni-30B-A3B-FP8)
* [`nvidia/Nemotron-3-Nano-Omni-30B-A3B-NVFP4`](https://huggingface.co/nvidia/Nemotron-3-Nano-Omni-30B-A3B-NVFP4)

**Agentic workloads this model enables:**

* **Computer Use Agent:** Perception loop for agents navigating GUIs — reads screens, understands UI state over time, validates outcomes. Collapses vision and reasoning into a single loop.
* **Document Intelligence:** Interprets documents, charts, tables, screenshots, and mixed media inputs for enterprise analysis and compliance workflows.
* **Audio & Video Understanding Agents:** Maintains continuous audio-video context for customer service, research, and monitoring workflows, tying what was said, shown, and documented into a single reasoning stream.

## 2. SGLang Installation

Install SGLang via pip, from source, or with Docker:

```shell Command theme={null}
# Install via pip
pip install sglang

# Or install from source
uv pip install 'git+https://github.com/sgl-project/sglang.git#subdirectory=python'

# Or use Docker
docker pull lmsysorg/sglang:dev-cu13-nemotronh-nano-omni-reasoning-v3
```

For the full Docker setup and other installation methods, refer to the [official SGLang installation guide](../../../docs/get-started/installation).

## 3. Model Deployment

This section provides a progressive guide from quick deployment to performance tuning.

### 3.1 Basic Configuration

**Interactive Command Generator**: select hardware, model variant, and common knobs to generate a launch command.

<Nemotron3NanoOmniDeployment />

### 3.2 Configuration Tips

* **Attention backend:**

  * **H100/H200:** Use the FlashAttention 3 backend by default.
  * **B200:** Use the FlashInfer backend by default.

* **TP support:**

  To set tensor parallelism, use `--tp <1|2|4|8>`. A 4×H100 setup is recommended for the BF16/Reasoning variant.

* **FP8 KV cache:**

  To enable FP8 KV cache, append `--kv-cache-dtype fp8_e4m3`. FP8 KV cache trades a small amount of accuracy for memory; omit the flag if you observe accuracy regressions on your workload.

* **Reasoning parser:**

  Append `--reasoning-parser deepseek-r1` to enable structured reasoning traces (`reasoning_content` field in the response).

* **Tool calling:**

  Append `--tool-call-parser qwen3_coder` to enable tool calling support.

## 4. Model Invocation

The command below launches the server for a 4×H100 setup with reasoning and tool calling enabled. See [Section 4.8](#48-fp8-and-nvfp4-deployment) for FP8 and NVFP4 variants.

```shell Command theme={null}
sglang serve \
  --model-path nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning \
  --host 0.0.0.0 \
  --port 30000 \
  --tp 4 \
  --trust-remote-code \
  --tool-call-parser qwen3_coder \
  --reasoning-parser deepseek-r1
```

### 4.1 Basic Usage (Text)

SGLang provides an OpenAI-compatible endpoint. Example with the OpenAI Python client:

```python Example theme={null}
# Minimal chat-completion request against the local SGLang server through its
# OpenAI-compatible endpoint.
from openai import OpenAI

SERVED_MODEL_NAME = "nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning"
client = OpenAI(base_url="http://localhost:30000/v1", api_key="EMPTY")

resp = client.chat.completions.create(
    model=SERVED_MODEL_NAME,
    messages=[
        {"role": "system", "content": "You are a helpful AI assistant."},
        {"role": "user", "content": "Give me 3 bullet points about SGLang."},
    ],
    temperature=0.6,
    max_tokens=512,
)

# Print the reasoning trace and the final answer under separate labels so the
# two parts of the response are distinguishable in the output.
message = resp.choices[0].message
print(f"Reasoning: {message.reasoning_content}\n\nContent:\n{message.content}")
```

Output:

```text Output theme={null}
Reasoning: SGLang is a serving framework I know from my training data. Let me recall the key features...

Content:
- **Radix Attention** — SGLang reuses KV cache across requests sharing a common prefix, dramatically reducing memory and compute for multi-turn and few-shot workloads.
- **OpenAI-compatible API** — Drop-in replacement for the OpenAI Python client; no application code changes required to serve a locally-hosted model.
- **High-throughput serving** — Continuous batching, chunked prefill, and optimized CUDA kernels deliver state-of-the-art throughput on NVIDIA GPUs across A100, H100, and B200.
```

Streaming chat completion:

```python Example theme={null}
from openai import OpenAI

SERVED_MODEL_NAME = "nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning"
client = OpenAI(base_url="http://localhost:30000/v1", api_key="EMPTY")

stream = client.chat.completions.create(
    model=SERVED_MODEL_NAME,
    messages=[
        {"role": "system", "content": "You are a helpful AI assistant."},
        {"role": "user", "content": "What are the first 5 prime numbers?"},
    ],
    temperature=0.6,
    max_tokens=512,
    stream=True,
)
for chunk in stream:
    delta = chunk.choices[0].delta
    if delta and delta.content:
        print(delta.content, end="", flush=True)
```

### 4.2 Image Understanding

Pass image inputs using the OpenAI vision format. Both URLs and base64-encoded images are supported:

```python Example theme={null}
from openai import OpenAI

SERVED_MODEL_NAME = "nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning"
client = OpenAI(base_url="http://localhost:30000/v1", api_key="EMPTY")

# From URL
resp = client.chat.completions.create(
    model=SERVED_MODEL_NAME,
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "image_url",
                    "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/3/3a/Cat03.jpg/1200px-Cat03.jpg"},
                },
                {"type": "text", "text": "Describe this image in detail."},
            ],
        }
    ],
    temperature=0.6,
    max_tokens=512,
)
print(resp.choices[0].message.reasoning_content)
print(resp.choices[0].message.content)
```

For local images, encode as base64:

```python Example theme={null}
import base64
from openai import OpenAI

SERVED_MODEL_NAME = "nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning"
client = OpenAI(base_url="http://localhost:30000/v1", api_key="EMPTY")

with open("screenshot.png", "rb") as f:
    image_b64 = base64.b64encode(f.read()).decode("utf-8")

resp = client.chat.completions.create(
    model=SERVED_MODEL_NAME,
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:image/png;base64,{image_b64}"},
                },
                {"type": "text", "text": "What UI elements are visible on this screen? What action would you take next?"},
            ],
        }
    ],
    temperature=0.6,
    max_tokens=512,
)
print(resp.choices[0].message.content)
```

### 4.3 Video Understanding

Nemotron 3 Nano Omni uses Conv3D layers and Efficient Video Sampling (EVS) for temporal-spatial video reasoning, processing longer videos at the same compute budget:

```python Example theme={null}
import base64
from openai import OpenAI

SERVED_MODEL_NAME = "nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning"
client = OpenAI(base_url="http://localhost:30000/v1", api_key="EMPTY")

with open("video.mp4", "rb") as f:
    video_b64 = base64.b64encode(f.read()).decode("utf-8")

resp = client.chat.completions.create(
    model=SERVED_MODEL_NAME,
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "video_url",
                    "video_url": {"url": f"data:video/mp4;base64,{video_b64}"},
                },
                {"type": "text", "text": "Summarize what happens in this video step by step."},
            ],
        }
    ],
    temperature=0.6,
    max_tokens=1024,
)
print(resp.choices[0].message.reasoning_content)
print(resp.choices[0].message.content)
```

### 4.4 Audio Understanding

Pass audio inputs as base64-encoded WAV or MP3 data:

```python Example theme={null}
import base64
from openai import OpenAI

SERVED_MODEL_NAME = "nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning"
client = OpenAI(base_url="http://localhost:30000/v1", api_key="EMPTY")

with open("audio.wav", "rb") as f:
    audio_b64 = base64.b64encode(f.read()).decode("utf-8")

resp = client.chat.completions.create(
    model=SERVED_MODEL_NAME,
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "input_audio",
                    "input_audio": {"data": audio_b64, "format": "wav"},
                },
                {"type": "text", "text": "Transcribe and summarize what was said in this audio."},
            ],
        }
    ],
    temperature=0.6,
    max_tokens=512,
)
print(resp.choices[0].message.content)
```

### 4.5 Mixed Multimodal Input

Combine modalities in a single request. For example, an image alongside an audio question about it:

```python Example theme={null}
# Mixed-modality request: one image plus one spoken question about it, sent
# together in a single user message.
import base64
from openai import OpenAI

SERVED_MODEL_NAME = "nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning"
client = OpenAI(base_url="http://localhost:30000/v1", api_key="EMPTY")

# Both inputs are sent inline as base64 payloads.
with open("chart.png", "rb") as f:
    image_b64 = base64.b64encode(f.read()).decode("utf-8")

with open("question.wav", "rb") as f:
    audio_b64 = base64.b64encode(f.read()).decode("utf-8")

resp = client.chat.completions.create(
    model=SERVED_MODEL_NAME,
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:image/png;base64,{image_b64}"},
                },
                {
                    "type": "input_audio",
                    "input_audio": {"data": audio_b64, "format": "wav"},
                },
                {"type": "text", "text": "Listen to the spoken question about this chart and answer it. What are the key trends and what conclusion does the data support?"},
            ],
        }
    ],
    temperature=0.6,
    max_tokens=1024,
)
print(resp.choices[0].message.reasoning_content)
print(resp.choices[0].message.content)
```

### 4.6 Reasoning

The model supports two modes: reasoning ON (the default) and reasoning OFF. To disable reasoning for a single request, pass `enable_thinking: False` through `chat_template_kwargs` in `extra_body`:

```python Example theme={null}
from openai import OpenAI

SERVED_MODEL_NAME = "nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning"
client = OpenAI(base_url="http://localhost:30000/v1", api_key="EMPTY")

# Reasoning ON (default)
print("Reasoning on")
resp = client.chat.completions.create(
    model=SERVED_MODEL_NAME,
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "What is the derivative of x^3 sin(x)?"},
    ],
    temperature=0.6,
    max_tokens=1024,
)
print(f"Reasoning:\n{resp.choices[0].message.reasoning_content[:300]}...\nContent:\n{resp.choices[0].message.content}")
print("\n")

# Reasoning OFF
print("Reasoning off")
resp = client.chat.completions.create(
    model=SERVED_MODEL_NAME,
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "What is 15% of 200?"},
    ],
    temperature=0.6,
    max_tokens=256,
    extra_body={"chat_template_kwargs": {"enable_thinking": False}},
)
print(f"Content:\n{resp.choices[0].message.content}")
```

Output:

```text Output theme={null}
Reasoning on
Reasoning:
The user wants the derivative of x^3 sin(x). I'll apply the product rule: d/dx[u·v] = u'v + uv'. Here u = x^3, v = sin(x). So u' = 3x^2, v' = cos(x). The result is 3x^2·sin(x) + x^3·cos(x)...
Content:
Using the product rule: d/dx[x³ sin(x)] = 3x² sin(x) + x³ cos(x)


Reasoning off
Content:
15% of 200 is **30**.
```

### 4.7 Tool Calling

Call functions using the OpenAI Tools schema. The server must be launched with `--tool-call-parser qwen3_coder`:

```python Example theme={null}
from openai import OpenAI

SERVED_MODEL_NAME = "nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning"
client = OpenAI(base_url="http://localhost:30000/v1", api_key="EMPTY")

TOOLS = [
    {
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Get the current weather for a location",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {
                        "type": "string",
                        "description": "City and state, e.g. San Francisco, CA",
                    },
                    "unit": {
                        "type": "string",
                        "enum": ["celsius", "fahrenheit"],
                    },
                },
                "required": ["location"],
            },
        },
    }
]

completion = client.chat.completions.create(
    model=SERVED_MODEL_NAME,
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "What is the weather like in Santa Clara, CA?"},
    ],
    tools=TOOLS,
    temperature=0.6,
    top_p=0.95,
    max_tokens=512,
    stream=False,
)
print(completion.choices[0].message.reasoning_content)
print(completion.choices[0].message.tool_calls)
```

Output:

```text Output theme={null}
The user is asking about weather in Santa Clara, CA. I have a get_weather function that takes a location and optional unit. I should call it with location="Santa Clara, CA".

[ChatCompletionMessageFunctionToolCall(id='call_abc123', function=Function(arguments='{"location": "Santa Clara, CA", "unit": "fahrenheit"}', name='get_weather'), type='function', index=0)]
```

### 4.8 FP8 and NVFP4 Deployment

**FP8 variant** (recommended for throughput-critical serving on H100/H200/B200):

```shell Command theme={null}
sglang serve \
  --model-path nvidia/Nemotron-3-Nano-Omni-30B-A3B-FP8 \
  --host 0.0.0.0 \
  --port 30000 \
  --tp 4 \
  --trust-remote-code \
  --tool-call-parser qwen3_coder \
  --reasoning-parser deepseek-r1
```

**NVFP4 variant** (maximum efficiency on Blackwell B200):

```shell Command theme={null}
sglang serve \
  --model-path nvidia/Nemotron-3-Nano-Omni-30B-A3B-NVFP4 \
  --host 0.0.0.0 \
  --port 30000 \
  --tp 2 \
  --trust-remote-code \
  --tool-call-parser qwen3_coder \
  --reasoning-parser deepseek-r1
```

***

## 5. Benchmark

### 5.1 Efficiency Benchmark

Nemotron 3 Nano Omni achieves **9x higher throughput** than other open omni models at the same interactivity level, delivering lower cost and better scalability without sacrificing responsiveness. It also achieves **\~20% higher multimodal intelligence** compared to the best open alternative across image, video, and audio reasoning tasks.

### 5.2 Speed Benchmark

**Test Environment:**

* Hardware: B200 (8×)
* Model: nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning
* Tensor Parallelism: 4
* SGLang Version: main branch

Model Deployment Command:

```shell Command theme={null}
sglang serve \
  --model-path nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning \
  --trust-remote-code \
  --tp 4 \
  --max-running-requests 1024 \
  --host 0.0.0.0 \
  --attention-backend flashinfer \
  --port 30000
```

Benchmark Command:

```shell Command theme={null}
python3 -m sglang.bench_serving \
  --backend sglang \
  --host 127.0.0.1 \
  --port 30000 \
  --model nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning \
  --dataset-name random \
  --random-input-len 1024 \
  --random-output-len 1024 \
  --num-prompts 4096 \
  --max-concurrency 256
```

**Test Results:**

```text Output theme={null}
============ Serving Benchmark Result ============
Backend:                                 sglang
Traffic request rate:                    inf
Max request concurrency:                 256
Successful requests:                     4096
Benchmark duration (s):                  206.52
Total input tokens:                      2081726
Total input text tokens:                 2081726
Total generated tokens:                  2087288
Total generated tokens (retokenized):    1945477
Request throughput (req/s):              19.83
Input token throughput (tok/s):          10080.25
Output token throughput (tok/s):         10107.18
Peak output token throughput (tok/s):    20199.00
Peak concurrent requests:                291
Total token throughput (tok/s):          20187.44
Concurrency:                             250.83
----------------End-to-End Latency----------------
Mean E2E Latency (ms):                   12646.47
Median E2E Latency (ms):                 12371.84
P90 E2E Latency (ms):                    22889.81
P99 E2E Latency (ms):                    26528.70
---------------Time to First Token----------------
Mean TTFT (ms):                          220.66
Median TTFT (ms):                        97.67
P99 TTFT (ms):                           2068.63
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms):                          24.98
Median TPOT (ms):                        24.36
P99 TPOT (ms):                           44.97
---------------Inter-Token Latency----------------
Mean ITL (ms):                           24.43
Median ITL (ms):                         10.91
P95 ITL (ms):                            62.68
P99 ITL (ms):                            100.60
Max ITL (ms):                            2171.93
==================================================
```

### 5.3 Accuracy Benchmark

**Environment**

* Hardware: B200 (8×)
* Model: nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning
* Tensor Parallelism: 4
* SGLang Version: main branch

**Launch Model**

```shell Command theme={null}
sglang serve \
  --model-path nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning \
  --trust-remote-code \
  --tp 4 \
  --attention-backend flashinfer \
  --reasoning-parser deepseek-r1
```

#### 5.3.1 GSM8K Benchmark

**Run Benchmark**

```shell Command theme={null}
python3 benchmark/gsm8k/bench_sglang.py --port 30000
```

**Test Results:**

```text Output theme={null}
Accuracy: 0.830
Invalid: 0.000
Latency: 13.970 s
Output throughput: 1611.623 token/s
```

#### 5.3.2 MMLU Benchmark

**Run Benchmark**

```shell Command theme={null}
python3 benchmark/mmlu/bench_sglang.py --port 30000
```

**Test Results:**

```text Output theme={null}
subject: abstract_algebra, #q:100, acc: 0.510
subject: anatomy, #q:135, acc: 0.711
subject: astronomy, #q:152, acc: 0.829
subject: business_ethics, #q:100, acc: 0.760
subject: clinical_knowledge, #q:265, acc: 0.781
subject: college_biology, #q:144, acc: 0.854
subject: college_chemistry, #q:100, acc: 0.560
subject: college_computer_science, #q:100, acc: 0.700
subject: college_mathematics, #q:100, acc: 0.590
subject: college_medicine, #q:173, acc: 0.775
subject: college_physics, #q:102, acc: 0.559
subject: computer_security, #q:100, acc: 0.750
subject: conceptual_physics, #q:235, acc: 0.821
subject: econometrics, #q:114, acc: 0.605
subject: electrical_engineering, #q:145, acc: 0.759
subject: elementary_mathematics, #q:378, acc: 0.638
subject: formal_logic, #q:126, acc: 0.524
subject: global_facts, #q:100, acc: 0.400
subject: high_school_biology, #q:310, acc: 0.906
subject: high_school_chemistry, #q:203, acc: 0.759
subject: high_school_computer_science, #q:100, acc: 0.860
subject: high_school_european_history, #q:165, acc: 0.812
subject: high_school_geography, #q:198, acc: 0.889
subject: high_school_government_and_politics, #q:193, acc: 0.933
subject: high_school_macroeconomics, #q:390, acc: 0.785
subject: high_school_mathematics, #q:270, acc: 0.496
subject: high_school_microeconomics, #q:238, acc: 0.887
subject: high_school_physics, #q:151, acc: 0.675
subject: high_school_psychology, #q:545, acc: 0.895
subject: high_school_statistics, #q:216, acc: 0.731
subject: high_school_us_history, #q:204, acc: 0.858
subject: high_school_world_history, #q:237, acc: 0.873
subject: human_aging, #q:223, acc: 0.740
subject: human_sexuality, #q:131, acc: 0.855
subject: international_law, #q:121, acc: 0.851
subject: jurisprudence, #q:108, acc: 0.815
subject: logical_fallacies, #q:163, acc: 0.847
subject: machine_learning, #q:112, acc: 0.598
subject: management, #q:103, acc: 0.864
subject: marketing, #q:234, acc: 0.910
subject: medical_genetics, #q:100, acc: 0.880
subject: miscellaneous, #q:783, acc: 0.881
subject: moral_disputes, #q:346, acc: 0.780
subject: moral_scenarios, #q:895, acc: 0.543
subject: nutrition, #q:306, acc: 0.814
subject: philosophy, #q:311, acc: 0.733
subject: prehistory, #q:324, acc: 0.852
subject: professional_accounting, #q:282, acc: 0.553
subject: professional_law, #q:1534, acc: 0.565
subject: professional_medicine, #q:272, acc: 0.779
subject: professional_psychology, #q:612, acc: 0.760
subject: public_relations, #q:110, acc: 0.709
subject: security_studies, #q:245, acc: 0.759
subject: sociology, #q:201, acc: 0.831
subject: us_foreign_policy, #q:100, acc: 0.910
subject: virology, #q:166, acc: 0.560
subject: world_religions, #q:171, acc: 0.807
Total latency: 67.512
Average accuracy: 0.737
```
