> ## Documentation Index
> Fetch the complete documentation index at: https://docs.sglang.io/llms.txt
> Use this file to discover all available pages before exploring further.

# Nemotron3-Nano

export const Nemotron3NanoDeployment = () => {
  const modelFamily = 'nvidia';
  const options = {
    hardware: {
      name: 'hardware',
      title: 'Hardware Platform',
      items: [{
        id: 'h200',
        label: 'H200',
        default: false
      }, {
        id: 'b200',
        label: 'B200',
        default: true
      }]
    },
    modelVariant: {
      name: 'modelVariant',
      title: 'Model Variant',
      items: [{
        id: 'bf16',
        label: 'BF16',
        default: true
      }, {
        id: 'fp8',
        label: 'FP8',
        default: false
      }, {
        id: 'nvfp4',
        label: 'NVFP4',
        default: false
      }]
    },
    tp: {
      name: 'tp',
      title: 'Tensor Parallel (TP)',
      items: [{
        id: '1',
        label: 'TP=1',
        default: true
      }, {
        id: '2',
        label: 'TP=2',
        default: false
      }, {
        id: '4',
        label: 'TP=4',
        default: false
      }, {
        id: '8',
        label: 'TP=8',
        default: false
      }]
    },
    kvcache: {
      name: 'kvcache',
      title: 'KV Cache DType',
      items: [{
        id: 'fp8_e4m3',
        label: 'fp8_e4m3',
        default: true
      }, {
        id: 'bf16',
        label: 'bf16',
        default: false
      }]
    },
    thinking: {
      name: 'thinking',
      title: 'Reasoning Parser',
      items: [{
        id: 'disabled',
        label: 'Disabled',
        default: true
      }, {
        id: 'enabled',
        label: 'Enabled',
        default: false
      }],
      commandRule: value => value === 'enabled' ? '--reasoning-parser nemotron_3' : null
    },
    toolcall: {
      name: 'toolcall',
      title: 'Tool Call Parser',
      items: [{
        id: 'disabled',
        label: 'Disabled',
        default: true
      }, {
        id: 'enabled',
        label: 'Enabled',
        default: false
      }],
      commandRule: value => value === 'enabled' ? '--tool-call-parser qwen3_coder' : null
    }
  };
  const generateCommand = values => {
    const {hardware, modelVariant, tp, kvcache, thinking, toolcall} = values;
    const variant = modelVariant || 'fp8';
    const baseName = 'NVIDIA-Nemotron-3-Nano-30B-A3B';
    const modelName = `${modelFamily}/${baseName}-${variant.toUpperCase()}`;
    let cmd = 'python3 -m sglang.launch_server \\\n';
    cmd += `  --model-path ${modelName} \\\n`;
    cmd += `  --trust-remote-code \\\n`;
    cmd += `  --tp ${tp} \\\n`;
    cmd += `  --kv-cache-dtype ${kvcache} \\\n`;
    for (const [key, option] of Object.entries(options)) {
      if (option.commandRule) {
        const rule = option.commandRule(values[key]);
        if (rule) {
          cmd += `  ${rule}  \\\n`;
        }
      }
    }
    cmd = cmd.trimEnd();
    if (cmd.endsWith('\\')) {
      cmd = cmd.slice(0, -1).trimEnd();
    }
    return cmd;
  };
  const getInitialState = () => {
    const initialState = {};
    Object.entries(options).forEach(([key, option]) => {
      if (option.type === 'checkbox') {
        initialState[key] = (option.items || []).filter(item => item.default).map(item => item.id);
        return;
      }
      if (option.type === 'text') {
        initialState[key] = option.default || '';
        return;
      }
      let items = option.items || [];
      if (option.getDynamicItems) {
        const defaultValues = {};
        Object.entries(options).forEach(([innerKey, innerOption]) => {
          if (innerOption.type === 'checkbox') {
            defaultValues[innerKey] = (innerOption.items || []).filter(item => item.default).map(item => item.id);
          } else if (innerOption.type === 'text') {
            defaultValues[innerKey] = innerOption.default || '';
          } else if (innerOption.items && innerOption.items.length > 0) {
            const defaultItem = innerOption.items.find(item => item.default);
            defaultValues[innerKey] = defaultItem ? defaultItem.id : innerOption.items[0].id;
          }
        });
        items = option.getDynamicItems(defaultValues);
      }
      const defaultItem = items && items.find(item => item.default);
      initialState[key] = defaultItem ? defaultItem.id : items && items[0] ? items[0].id : '';
    });
    return initialState;
  };
  const [values, setValues] = useState(getInitialState);
  const [isDark, setIsDark] = useState(false);
  useEffect(() => {
    const checkDarkMode = () => {
      const html = document.documentElement;
      const isDarkMode = html.classList.contains('dark') || html.getAttribute('data-theme') === 'dark' || html.style.colorScheme === 'dark';
      setIsDark(isDarkMode);
    };
    checkDarkMode();
    const observer = new MutationObserver(checkDarkMode);
    observer.observe(document.documentElement, {
      attributes: true,
      attributeFilter: ['class', 'data-theme', 'style']
    });
    return () => observer.disconnect();
  }, []);
  const handleRadioChange = (optionName, value) => {
    setValues(prev => ({
      ...prev,
      [optionName]: value
    }));
  };
  const handleCheckboxChange = (optionName, itemId, isChecked) => {
    setValues(prev => {
      const currentValues = prev[optionName] || [];
      if (isChecked) {
        return {
          ...prev,
          [optionName]: [...currentValues, itemId]
        };
      }
      return {
        ...prev,
        [optionName]: currentValues.filter(id => id !== itemId)
      };
    });
  };
  const handleTextChange = (optionName, value) => {
    setValues(prev => ({
      ...prev,
      [optionName]: value
    }));
  };
  const command = generateCommand(values);
  const containerStyle = {
    maxWidth: '900px',
    margin: '0 auto',
    display: 'flex',
    flexDirection: 'column',
    gap: '4px'
  };
  const cardStyle = {
    padding: '8px 12px',
    border: `1px solid ${isDark ? '#374151' : '#e5e7eb'}`,
    borderLeft: `3px solid ${isDark ? '#E85D4D' : '#D45D44'}`,
    borderRadius: '4px',
    display: 'flex',
    alignItems: 'center',
    gap: '12px',
    background: isDark ? '#1f2937' : '#fff'
  };
  const titleStyle = {
    fontSize: '13px',
    fontWeight: '600',
    minWidth: '140px',
    flexShrink: 0,
    color: isDark ? '#e5e7eb' : 'inherit'
  };
  const itemsStyle = {
    display: 'flex',
    rowGap: '2px',
    columnGap: '6px',
    flexWrap: 'wrap',
    alignItems: 'center',
    flex: 1
  };
  const labelBaseStyle = {
    padding: '4px 10px',
    border: `1px solid ${isDark ? '#9ca3af' : '#d1d5db'}`,
    borderRadius: '3px',
    cursor: 'pointer',
    display: 'inline-flex',
    flexDirection: 'column',
    alignItems: 'center',
    justifyContent: 'center',
    fontWeight: '500',
    fontSize: '13px',
    transition: 'all 0.2s',
    userSelect: 'none',
    minWidth: '45px',
    textAlign: 'center',
    flex: 1,
    background: isDark ? '#374151' : '#fff',
    color: isDark ? '#e5e7eb' : 'inherit'
  };
  const checkedStyle = {
    background: '#D45D44',
    color: 'white',
    borderColor: '#D45D44'
  };
  const disabledStyle = {
    cursor: 'not-allowed',
    opacity: 0.5
  };
  const subtitleStyle = {
    display: 'block',
    fontSize: '9px',
    marginTop: '1px',
    lineHeight: '1.1',
    opacity: 0.7
  };
  const textInputStyle = {
    flex: 1,
    padding: '8px 10px',
    borderRadius: '4px',
    border: `1px solid ${isDark ? '#4b5563' : '#d1d5db'}`,
    background: isDark ? '#111827' : '#fff',
    color: isDark ? '#e5e7eb' : '#111827',
    fontSize: '13px'
  };
  const commandDisplayStyle = {
    flex: 1,
    padding: '12px 16px',
    background: isDark ? '#111827' : '#f5f5f5',
    borderRadius: '6px',
    fontFamily: "'Menlo', 'Monaco', 'Courier New', monospace",
    fontSize: '12px',
    lineHeight: '1.5',
    color: isDark ? '#e5e7eb' : '#374151',
    whiteSpace: 'pre-wrap',
    overflowX: 'auto',
    margin: 0,
    border: `1px solid ${isDark ? '#374151' : '#e5e7eb'}`
  };
  return <div style={containerStyle} className="not-prose">
      {Object.entries(options).map(([key, option]) => {
    if (option.condition && !option.condition(values)) {
      return null;
    }
    const items = option.getDynamicItems ? option.getDynamicItems(values) : option.items || [];
    return <div key={key} style={cardStyle}>
            <div style={titleStyle}>{option.title}</div>
            <div style={itemsStyle}>
              {option.type === 'text' ? <input type="text" value={values[option.name] || ''} placeholder={option.placeholder || ''} onChange={event => handleTextChange(option.name, event.target.value)} style={textInputStyle} /> : option.type === 'checkbox' ? (option.items || []).map(item => {
      const isChecked = (values[option.name] || []).includes(item.id);
      const isDisabled = item.required || typeof item.disabledWhen === 'function' && item.disabledWhen(values);
      return <label key={item.id} title={item.disabledReason || ''} style={{
        ...labelBaseStyle,
        ...isChecked ? checkedStyle : {},
        ...isDisabled ? disabledStyle : {}
      }}>
                      <input type="checkbox" checked={isChecked} disabled={isDisabled} onChange={event => handleCheckboxChange(option.name, item.id, event.target.checked)} style={{
        display: 'none'
      }} />
                      {item.label}
                      {item.subtitle && <small style={{
        ...subtitleStyle,
        color: isChecked ? 'rgba(255,255,255,0.85)' : 'inherit'
      }}>
                          {item.subtitle}
                        </small>}
                    </label>;
    }) : items.map(item => {
      const isChecked = values[option.name] === item.id;
      const isDisabled = Boolean(item.disabled);
      return <label key={item.id} title={item.disabledReason || ''} style={{
        ...labelBaseStyle,
        ...isChecked ? checkedStyle : {},
        ...isDisabled ? disabledStyle : {}
      }}>
                      <input type="radio" name={option.name} value={item.id} checked={isChecked} disabled={isDisabled} onChange={() => !isDisabled && handleRadioChange(option.name, item.id)} style={{
        display: 'none'
      }} />
                      {item.label}
                      {item.subtitle && <small style={{
        ...subtitleStyle,
        color: isChecked ? 'rgba(255,255,255,0.85)' : 'inherit'
      }}>
                          {item.subtitle}
                        </small>}
                    </label>;
    })}
            </div>
          </div>;
  })}
      <div style={cardStyle}>
        <div style={titleStyle}>Run this Command:</div>
        <pre style={commandDisplayStyle}>{command}</pre>
      </div>
    </div>;
};

## 1. Model Introduction

`NVIDIA Nemotron3-Nano` is a 30B-parameter hybrid LLM that mixes Mixture-of-Experts (MoE) feed-forward layers, Mamba2 sequence-modeling layers, and standard self-attention layers in a single stack rather than classic “attention + MLP” transformer blocks.

The BF16 variant (`nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16`) is designed as a high-fidelity reference model. For optimized inference performance on modern NVIDIA GPUs, the FP8 variant (`nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-FP8`) and the NVFP4 variant (`nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-NVFP4`) are supported.

At a high level:

* **Hybrid layer stack (Mamba2 + MoE + attention):** The network is composed of interleaved layers that are *either* Mamba2, *or* MoE feed-forward, *or* attention-only.
* **Non-uniform layer ordering:** The order and mix of these specialized layers is not a simple, rigid pattern, enabling the model to trade off sequence modeling, routing capacity, and expressivity across depth.
* **Deployment-friendly precision:** Use BF16 for accuracy-sensitive and evaluation workloads; use FP8 for latency- and throughput-critical serving on recent NVIDIA GPUs.

## 2. SGLang Installation

Refer to the [official SGLang installation guide](../../../docs/get-started/install), or install the nightly wheel with:

```bash Command theme={null}
uv pip install sglang==0.5.6.post3.dev1278+gad1b4e472 --extra-index-url https://sgl-project.github.io/whl/nightly/
```

## 3. Model Deployment

This section provides a progressive guide from quick deployment to performance tuning.

### 3.1 Basic Configuration

**Interactive Command Generator**: select hardware, model variant, and common knobs to generate a launch command.

<Nemotron3NanoDeployment />

### 3.2 Configuration Tips

* **Attention backend**:

  * **H200**: Uses the FlashAttention 3 backend by default.
  * **B200**: Uses the FlashInfer backend by default.

* **TP support**:

  To set tp size, use `--tp <1|2|4|8>`.

* **FP8 KV cache**:

  To enable fp8 kv cache, please append `--kv-cache-dtype fp8_e4m3`.

## 4. Model Invocation

### 4.1 Basic Usage (OpenAI-Compatible API)

SGLang provides an OpenAI-compatible endpoint. Example with the OpenAI Python client:

```python Example theme={null}
from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:30000/v1",
    api_key="EMPTY",
)

resp = client.chat.completions.create(
    model="nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-FP8",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Summarize what MoE models are in 5 bullets."},
    ],
    temperature=0.7,
    max_tokens=256,
)

print(resp.choices[0].message.content)

```

Streaming chat completion

```python Example theme={null}
from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:30000/v1",
    api_key="EMPTY",
)

stream = client.chat.completions.create(
    model="nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-FP8",
    messages=[
        {"role": "system", "content": "You are a helpful AI assistant."},
        {"role": "user", "content": "What are the first 5 prime numbers?"}
    ],
    temperature=0.7,
    max_tokens=1024,
    stream=True,
)
for chunk in stream:
    delta = chunk.choices[0].delta
    if delta and delta.content:
        print(delta.content, end="", flush=True)
```

### 4.2 Reasoning

To enable reasoning, append `--reasoning-parser nemotron_3` to the launch command. The model supports two modes: reasoning ON (the default) and reasoning OFF. To turn reasoning off for a request, set `enable_thinking` to `False` in `chat_template_kwargs`, as shown below.

```python Example theme={null}
from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:30000/v1",
    api_key="EMPTY",
)

# Reasoning on (default)
print("Reasoning on")
resp = client.chat.completions.create(
    model="nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-FP8",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Write a haiku about GPUs."}
    ],
    temperature=0.7,
    max_tokens=512,
)
print(resp.choices[0].message.reasoning_content)

# Reasoning off
print("Reasoning off")
resp = client.chat.completions.create(
    model="nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-FP8",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Write a haiku about GPUs."}
    ],
    temperature=0.6,
    max_tokens=256,
    extra_body={"chat_template_kwargs": {"enable_thinking": False}}
)
print(resp.choices[0].message.reasoning_content)

```

### 4.3 Tool calling

To enable tool calling, append `--tool-call-parser qwen3_coder` to the launch command. Call functions using the OpenAI Tools schema and inspect the returned `tool_calls`.

```python Example theme={null}
from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:30000/v1",
    api_key="EMPTY",
)

# Tool calling via OpenAI tools schema
TOOLS = [
    {
        "type": "function",
        "function": {
            "name": "calculate_tip",
            "parameters": {
                "type": "object",
                "properties": {
                    "bill_total": {
                        "type": "integer",
                        "description": "The total amount of the bill"
                    },
                    "tip_percentage": {
                        "type": "integer",
                        "description": "The percentage of tip to be applied"
                    }
                },
                "required": ["bill_total", "tip_percentage"]
            }
        }
    }
]

completion = client.chat.completions.create(
    model="nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-FP8",
    messages=[
        {"role": "system", "content": ""},
        {"role": "user", "content": "My bill is $50. What will be the amount for 15% tip?"}
    ],
    tools=TOOLS,
    temperature=0.6,
    top_p=0.95,
    max_tokens=512,
    stream=False
)

print(completion.choices[0].message.reasoning_content)
print(completion.choices[0].message.tool_calls)
```

***

## 5. Benchmark

### 5.1 Speed Benchmark

**Test Environment:**

* Hardware: NVIDIA B200 GPU

**FP8 variant**

* Model Deployment Command:

```shell Command theme={null}
python3 -m sglang.launch_server \
  --model-path nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-FP8 \
  --trust-remote-code \
  --max-running-requests 1024 \
  --host 0.0.0.0 \
  --port 30000
```

* Benchmark Command:

```shell Command theme={null}
python3 -m sglang.bench_serving \
  --backend sglang \
  --host 127.0.0.1 \
  --port 30000 \
  --model nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-FP8 \
  --dataset-name random \
  --random-input-len 1024 \
  --random-output-len 1024 \
  --num-prompts 4096 \
  --max-concurrency 256
```

* **Test Results:**

```text Output theme={null}
============ Serving Benchmark Result ============
Backend:                                 sglang
Traffic request rate:                    inf
Max request concurrency:                 256
Successful requests:                     4096
Benchmark duration (s):                  183.18
Total input tokens:                      2081726
Total input text tokens:                 2081726
Total input vision tokens:               0
Total generated tokens:                  2116125
Total generated tokens (retokenized):    1076256
Request throughput (req/s):              22.36
Input token throughput (tok/s):          11364.25
Output token throughput (tok/s):         11552.04
Peak output token throughput (tok/s):    24692.00
Peak concurrent requests:                294
Total token throughput (tok/s):          22916.30
Concurrency:                             251.19
----------------End-to-End Latency----------------
Mean E2E Latency (ms):                   11233.74
Median E2E Latency (ms):                 11142.97
---------------Time to First Token----------------
Mean TTFT (ms):                          172.99
Median TTFT (ms):                        116.57
P99 TTFT (ms):                           1193.68
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms):                          21.74
Median TPOT (ms):                        21.14
P99 TPOT (ms):                           41.12
---------------Inter-Token Latency----------------
Mean ITL (ms):                           21.45
Median ITL (ms):                         9.06
P95 ITL (ms):                            62.59
P99 ITL (ms):                            110.83
Max ITL (ms):                            5368.19
==================================================
```

**BF16 variant**

* Model Deployment Command:

```shell Command theme={null}
python3 -m sglang.launch_server \
  --model-path nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16 \
  --trust-remote-code \
  --max-running-requests 1024 \
  --host 0.0.0.0 \
  --port 30000
```

* Benchmark Command:

```shell Command theme={null}
python3 -m sglang.bench_serving \
  --backend sglang \
  --host 127.0.0.1 \
  --port 30000 \
  --model nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16 \
  --dataset-name random \
  --random-input-len 1024 \
  --random-output-len 1024 \
  --num-prompts 4096 \
  --max-concurrency 256
```

* **Test Results:**

```text Output theme={null}
============ Serving Benchmark Result ============
Backend:                                 sglang
Traffic request rate:                    inf
Max request concurrency:                 256
Successful requests:                     4096
Benchmark duration (s):                  360.22
Total input tokens:                      2081726
Total input text tokens:                 2081726
Total input vision tokens:               0
Total generated tokens:                  2087288
Total generated tokens (retokenized):    1940652
Request throughput (req/s):              11.37
Input token throughput (tok/s):          5779.10
Output token throughput (tok/s):         5794.55
Peak output token throughput (tok/s):    9169.00
Peak concurrent requests:                276
Total token throughput (tok/s):          11573.65
Concurrency:                             249.76
----------------End-to-End Latency----------------
Mean E2E Latency (ms):                   21965.10
Median E2E Latency (ms):                 21706.35
---------------Time to First Token----------------
Mean TTFT (ms):                          211.54
Median TTFT (ms):                        93.06
P99 TTFT (ms):                           2637.66
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms):                          43.27
Median TPOT (ms):                        43.04
P99 TPOT (ms):                           61.15
---------------Inter-Token Latency----------------
Mean ITL (ms):                           42.77
Median ITL (ms):                         28.46
P95 ITL (ms):                            71.85
P99 ITL (ms):                            113.20
Max ITL (ms):                            5237.28
==================================================

```

### 5.2 Accuracy Benchmark

#### 5.2.1 GSM8K Benchmark

**Environment**

* Hardware: NVIDIA B200 GPU
* Model: BF16 checkpoint

**Launch Model**

```bash Command theme={null}
python3 -m sglang.launch_server \
  --model-path nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16 \
  --trust-remote-code \
  --reasoning-parser nemotron_3
```

**Run Benchmark with lm-eval**

```bash Command theme={null}
pip install lm-eval[api]==0.4.9.2

lm_eval --model local-completions --tasks gsm8k --model_args "model=nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16,base_url=http://127.0.0.1:30000/v1/completions,num_concurrent=4,max_retries=3,tokenized_requests=False,max_lengths=16384" --gen_kwargs '{"chat_template_kwargs":{"thinking":true}}' --batch_size 256
```

**Test Results:**

```text Output theme={null}
|Tasks|Version|     Filter     |n-shot|  Metric   |   |Value |   |Stderr|
|-----|------:|----------------|-----:|-----------|---|-----:|---|-----:|
|gsm8k|      3|flexible-extract|     5|exact_match|↑  |0.5603|±  |0.0137|
|     |       |strict-match    |     5|exact_match|↑  |0.8453|±  |0.0100|
```
