> ## Documentation Index
> Fetch the complete documentation index at: https://docs.sglang.io/llms.txt
> Use this file to discover all available pages before exploring further.

# NVIDIA Nemotron3-Ultra

> Deploy NVIDIA Nemotron3-Ultra with SGLang - 550B hybrid MoE model (55B active) with 1M context window, BF16/NVFP4 support, built for long-running autonomous agents.

export const Nemotron3UltraDeployment = () => {
  // Hugging Face repository for each precision variant, keyed by the `model` option id.
  const MODEL_PATHS = {
    bf16: 'nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-BF16',
    nvfp4: 'nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-NVFP4'
  };
  // Support matrix: one entry per verified (model, hardware, tp) tuple.
  // `tp` is kept as a string because it is compared against option ids with ===.
  // `multinode: true` marks tuples rendered as "2-node"; single-node entries omit the key.
  const VERIFIED_CONFIGS = [
    // BF16
    {model: 'bf16', hardware: 'h100', tp: '16', multinode: true},
    {model: 'bf16', hardware: 'h200', tp: '16', multinode: true},
    {model: 'bf16', hardware: 'b200', tp: '8'},
    {model: 'bf16', hardware: 'b300', tp: '8'},
    // NVFP4
    {model: 'nvfp4', hardware: 'b200', tp: '4'},
    {model: 'nvfp4', hardware: 'b200', tp: '8'},
    {model: 'nvfp4', hardware: 'b300', tp: '4'},
    {model: 'nvfp4', hardware: 'b300', tp: '8'},
    {model: 'nvfp4', hardware: 'gb200', tp: '4'},
    {model: 'nvfp4', hardware: 'gb300', tp: '4'}
  ];
  // Returns the VERIFIED_CONFIGS entry whose model, hardware and tp all match exactly, or undefined.
  const findVerified = (model, hardware, tp) => {
    return VERIFIED_CONFIGS.find(entry => entry.model === model && entry.hardware === hardware && entry.tp === tp);
  };
  // Distinct hardware ids verified for `model`, in matrix order.
  const verifiedHardwareForModel = model => {
    const hardwareIds = new Set();
    for (const entry of VERIFIED_CONFIGS) {
      if (entry.model === model) {
        hardwareIds.add(entry.hardware);
      }
    }
    return Array.from(hardwareIds);
  };
  // Distinct TP ids (strings) verified for the model/hardware pair, in matrix order.
  const verifiedTpForModelHardware = (model, hardware) => {
    const tpIds = new Set();
    for (const entry of VERIFIED_CONFIGS) {
      if (entry.model === model && entry.hardware === hardware) {
        tpIds.add(entry.tp);
      }
    }
    return Array.from(tpIds);
  };
  // DP sizes offered for a model before the TP cap is applied: BF16 offers only 2.
  const dpCandidatesForModel = model => {
    if (model === 'bf16') {
      return ['2'];
    }
    return ['2', '4', '8'];
  };
  // Largest verified TP for the pair as a number; 0 when the pair has no verified entry.
  const maxVerifiedTpForModelHardware = (model, hardware) => {
    const sizes = verifiedTpForModelHardware(model, hardware).map(Number);
    if (sizes.length === 0) {
      return 0;
    }
    return Math.max(...sizes);
  };
  // DP candidates that do not exceed either the requested TP or the largest verified TP.
  const verifiedDpForModelHardwareTp = (model, hardware, tp) => {
    // A missing or non-numeric tp counts as 0, which rules out every candidate.
    const requestedTp = Number(tp) || 0;
    const dpLimit = Math.min(requestedTp, maxVerifiedTpForModelHardware(model, hardware));
    return dpCandidatesForModel(model).filter(candidate => Number(candidate) <= dpLimit);
  };
  /**
   * Declarative description of every selector rendered by the generator.
   * Insertion order matters: it is both the on-screen order and the order in
   * which `commandRule` fragments are appended to the launch command.
   *
   * Per-option fields used in this file:
   *   - name / title: state key and card heading.
   *   - items: static choices ({id, label, default?, subtitle?}).
   *   - getDynamicItems(values): choices recomputed from the current selection;
   *     unavailable ones carry `disabled` and a `disabledReason` tooltip.
   *   - commandRule(value, state): CLI fragment for the selected value, or null
   *     to emit nothing.
   */
  const options = {
    // Precision variant; the id indexes the model-path table and the verified matrix.
    model: {
      name: 'model',
      title: 'Model',
      items: [{
        id: 'bf16',
        label: 'BF16',
        default: false
      }, {
        id: 'nvfp4',
        label: 'NVFP4',
        default: true,
        subtitle: 'Blackwell only'
      }]
    },
    // Hardware list is fixed; entries not verified for the chosen model are disabled.
    hardware: {
      name: 'hardware',
      title: 'Hardware Platform',
      getDynamicItems: values => {
        const supported = new Set(verifiedHardwareForModel(values.model));
        const base = [{
          id: 'h100',
          label: 'H100',
          default: false
        }, {
          id: 'h200',
          label: 'H200',
          default: false
        }, {
          id: 'b200',
          label: 'B200',
          default: true
        }, {
          id: 'gb200',
          label: 'GB200',
          default: false
        }, {
          id: 'b300',
          label: 'B300',
          default: false
        }, {
          id: 'gb300',
          label: 'GB300',
          default: false
        }];
        return base.map(it => {
          const ok = supported.has(it.id);
          return {
            ...it,
            disabled: !ok,
            disabledReason: ok ? '' : `${values.model.toUpperCase()} is not verified on ${it.label}`
          };
        });
      }
    },
    // TP sizes verified for the chosen model/hardware pair.
    tp: {
      name: 'tp',
      title: 'Tensor Parallel (TP)',
      getDynamicItems: values => {
        const supported = new Set(verifiedTpForModelHardware(values.model, values.hardware));
        const base = [{
          id: '4',
          label: 'TP=4'
        }, {
          id: '8',
          label: 'TP=8'
        }, {
          id: '16',
          label: 'TP=16',
          subtitle: '2-node'
        }];
        return base.map(it => {
          const ok = supported.has(it.id);
          return {
            ...it,
            // Only flagged as the default when it is the single verified TP for this pair.
            default: ok && supported.size === 1,
            disabled: !ok,
            disabledReason: ok ? '' : `TP=${it.id} is not verified for ${values.model.toUpperCase()} on ${values.hardware.toUpperCase()}`
          };
        });
      }
    },
    // Expert parallelism: when enabled, the EP size mirrors the selected TP size.
    ep: {
      name: 'ep',
      title: 'Expert Parallel (EP)',
      items: [{
        id: 'enabled',
        label: 'Enabled',
        subtitle: 'EP = TP'
      }, {
        id: 'disabled',
        label: 'Disabled',
        default: true
      }],
      commandRule: (value, state) => value === 'enabled' ? `--ep ${state.tp}` : null
    },
    // DP attention: "Disabled" is always selectable; DP sizes depend on model, hardware and TP.
    dpattention: {
      name: 'dpattention',
      title: 'DP Attention',
      getDynamicItems: values => {
        const allowed = new Set(verifiedDpForModelHardwareTp(values.model, values.hardware, values.tp));
        const base = [{
          id: 'disabled',
          label: 'Disabled',
          subtitle: 'Low latency',
          default: true
        }, {
          id: '2',
          label: 'DP=2',
          subtitle: 'High throughput'
        }, {
          id: '4',
          label: 'DP=4',
          subtitle: 'High throughput'
        }, {
          id: '8',
          label: 'DP=8',
          subtitle: 'High throughput'
        }];
        return base.map(it => {
          if (it.id === 'disabled') return it;
          const ok = allowed.has(it.id);
          return {
            ...it,
            disabled: !ok,
            disabledReason: ok ? '' : `DP=${it.id} is not verified for ${values.model.toUpperCase()} on ${values.hardware.toUpperCase()} at TP=${values.tp}`
          };
        });
      },
      // Uses the same helper as getDynamicItems so the emitted flags can never
      // disagree with what the UI offers (e.g. a stale DP value left selected
      // after TP was lowered). 'disabled' and undefined are never in the list.
      commandRule: (value, state) => verifiedDpForModelHardwareTp(state.model, state.hardware, state.tp).includes(value) ? `--dp ${value} \\\n  --enable-dp-attention` : null
    },
    // Multi-token prediction: emits the speculative-decoding flag set when enabled.
    mtp: {
      name: 'mtp',
      title: 'Multi-token Prediction (MTP)',
      items: [{
        id: 'enabled',
        label: 'Enabled',
        default: true
      }, {
        id: 'disabled',
        label: 'Disabled',
        default: false
      }],
      commandRule: value => value === 'enabled' ? '--speculative-algorithm EAGLE \\\n  --speculative-num-steps 3 \\\n  --speculative-eagle-topk 1 \\\n  --speculative-num-draft-tokens 4' : null
    },
    // KV cache dtype has no commandRule: its flag is appended by the command builder.
    kvcache: {
      name: 'kvcache',
      title: 'KV Cache DType',
      items: [{
        id: 'none',
        label: 'None',
        default: true
      }, {
        id: 'fp8_e4m3',
        label: 'fp8_e4m3',
        default: false
      }, {
        id: 'bf16',
        label: 'bf16',
        default: false
      }]
    },
    // Reasoning parser toggle.
    thinking: {
      name: 'thinking',
      title: 'Reasoning Parser',
      items: [{
        id: 'enabled',
        label: 'Enabled',
        default: true
      }, {
        id: 'disabled',
        label: 'Disabled',
        default: false
      }],
      commandRule: value => value === 'enabled' ? '--reasoning-parser nemotron_3' : null
    },
    // Tool-call parser toggle.
    toolcall: {
      name: 'toolcall',
      title: 'Tool Call Parser',
      items: [{
        id: 'enabled',
        label: 'Enabled',
        default: true
      }, {
        id: 'disabled',
        label: 'Disabled',
        default: false
      }],
      commandRule: value => value === 'enabled' ? '--tool-call-parser qwen3_coder' : null
    }
  };
  /**
   * Renders VERIFIED_CONFIGS as shell-comment lines, grouped by model in
   * first-seen order. Each model gets a `#   MODEL:` heading followed by one
   * `#     - HARDWARE @ TP=n` row per entry (", 2-node" appended for
   * multinode entries).
   * @returns {string} newline-joined comment block.
   */
  const renderVerifiedMatrix = () => {
    const grouped = VERIFIED_CONFIGS.reduce((acc, entry) => {
      if (!acc[entry.model]) {
        acc[entry.model] = [];
      }
      acc[entry.model].push(entry);
      return acc;
    }, {});
    const sections = [];
    for (const [modelId, entries] of Object.entries(grouped)) {
      const rows = entries.map(entry => {
        const nodeSuffix = entry.multinode ? ', 2-node' : '';
        return `#     - ${entry.hardware.toUpperCase()} @ TP=${entry.tp}${nodeSuffix}`;
      });
      sections.push(`#   ${modelId.toUpperCase()}:\n${rows.join('\n')}`);
    }
    return sections.join('\n');
  };
  /**
   * Builds the text shown in the command pane for the current selection.
   * @param {object} values - current selection keyed by option name; reads
   *   `model`, `hardware`, `tp`, `kvcache` directly and passes the rest to each
   *   option's `commandRule`.
   * @returns {string} the launch command (arguments separated by " \" line
   *   continuations), or a `#`-commented error plus the verified matrix when
   *   the (model, hardware, tp) tuple is not verified.
   */
  const generateCommand = values => {
    const {tp, kvcache, model, hardware} = values;
    const verified = findVerified(model, hardware, tp);
    if (!verified) {
      // Unverified tuple: show an explanation instead of a runnable command.
      const notice = [
        `# ERROR: ${model.toUpperCase()} on ${hardware.toUpperCase()} with TP=${tp} is not a verified configuration.`,
        `# The launch command has been suppressed to avoid running an unvalidated setup.`,
        `#`,
        `# Verified configurations:`,
        renderVerifiedMatrix()
      ];
      return notice.join('\n');
    }
    const args = [
      'python3 -m sglang.launch_server',
      `--model-path ${MODEL_PATHS[model] || MODEL_PATHS['bf16']}`,
      '--trust-remote-code',
      `--tp ${tp}`
    ];
    // Option-driven flags, in the declaration order of `options`.
    Object.entries(options).forEach(([key, option]) => {
      const fragment = option.commandRule ? option.commandRule(values[key], values) : null;
      if (fragment) {
        args.push(fragment);
      }
    });
    args.push('--mamba-scheduler-strategy extra_buffer');
    const isBlackwell = ['b200', 'gb200', 'b300', 'gb300'].includes(hardware);
    if (isBlackwell) {
      args.push('--attention-backend trtllm_mha');
    }
    if (kvcache && kvcache !== 'none') {
      args.push(`--kv-cache-dtype ${kvcache}`);
    }
    if (verified.multinode) {
      args.push('--dist-init-addr <head-node-ip>:5000', '--nnodes 2', '--node-rank <0|1>');
    }
    // Every argument but the last ends with a line continuation; trailing whitespace is dropped.
    return args.join(' \\\n  ').trimEnd();
  };
  /**
   * Computes the initial selection for every entry in `options`, in
   * declaration order, so that dynamic options can read the values already
   * chosen for the options declared before them.
   *   - checkbox options: array of the ids of items flagged `default`.
   *   - text options: `option.default`, or '' when unset.
   *   - everything else (radio): a single item id, '' when there are no items.
   * For radio options an enabled item is always preferred, so the initial
   * state never starts on a choice the UI renders as disabled. Only when
   * every item is disabled does it fall back to the default / first item.
   * @returns {object} map of option key to initial value.
   */
  const getInitialState = () => {
    const initialState = {};
    Object.entries(options).forEach(([key, option]) => {
      if (option.type === 'checkbox') {
        initialState[key] = (option.items || []).filter(item => item.default).map(item => item.id);
        return;
      }
      if (option.type === 'text') {
        initialState[key] = option.default || '';
        return;
      }
      // Dynamic options see the partially built state (earlier keys only).
      const items = (option.getDynamicItems ? option.getDynamicItems(initialState) : option.items) || [];
      const selectable = items.filter(item => !item.disabled);
      const chosen = selectable.find(item => item.default) ?? selectable[0] ?? items.find(item => item.default) ?? items[0];
      initialState[key] = chosen ? chosen.id : '';
    });
    return initialState;
  };
  // Current selection per option key; the initializer function is handed to useState.
  const [values, setValues] = useState(getInitialState);
  // Whether the host page is currently in dark mode; drives the inline style colours.
  const [isDark, setIsDark] = useState(false);
  // Registered with an empty dependency list: sync once, then follow <html> attribute changes.
  useEffect(() => {
    const syncTheme = () => {
      const root = document.documentElement;
      // Any one of these three signals on <html> counts as dark mode; checked in order.
      const darkSignals = [
        () => root.classList.contains('dark'),
        () => root.getAttribute('data-theme') === 'dark',
        () => root.style.colorScheme === 'dark'
      ];
      setIsDark(darkSignals.some(isOn => isOn()));
    };
    syncTheme();
    const themeObserver = new MutationObserver(syncTheme);
    themeObserver.observe(document.documentElement, {
      attributes: true,
      attributeFilter: ['class', 'data-theme', 'style']
    });
    // Cleanup: stop observing.
    return () => themeObserver.disconnect();
  }, []);
  // Radio selection: store `value` under the option's key, leaving other keys as they were.
  const handleRadioChange = (optionName, value) => {
    setValues(prev => {
      return {...prev, [optionName]: value};
    });
  };
  // Checkbox toggle: append the item id when checked, drop every occurrence when unchecked.
  // A key that has no array yet is treated as an empty selection.
  const handleCheckboxChange = (optionName, itemId, isChecked) => {
    setValues(prev => {
      const selected = prev[optionName] || [];
      const nextSelected = isChecked ? [...selected, itemId] : selected.filter(id => id !== itemId);
      return {...prev, [optionName]: nextSelected};
    });
  };
  // Free-text input: store the typed string under the option's key.
  const handleTextChange = (optionName, value) => {
    setValues(prev => Object.assign({}, prev, {[optionName]: value}));
  };
  // Command text for the current selection.
  const command = generateCommand(values);
  // Picks the dark- or light-theme variant of a style value.
  const themed = (darkValue, lightValue) => (isDark ? darkValue : lightValue);
  // Accent colour shared by the selected-item highlight and the light-theme card stripe.
  const accentColor = '#D45D44';
  // Outer column that stacks the option cards and the command card.
  const containerStyle = {
    maxWidth: '900px',
    margin: '0 auto',
    display: 'flex',
    flexDirection: 'column',
    gap: '4px'
  };
  // One row per option. `borderLeft` must stay after `border` so the accent stripe wins.
  const cardStyle = {
    padding: '8px 12px',
    border: `1px solid ${themed('#374151', '#e5e7eb')}`,
    borderLeft: `3px solid ${themed('#E85D4D', accentColor)}`,
    borderRadius: '4px',
    display: 'flex',
    alignItems: 'center',
    gap: '12px',
    background: themed('#1f2937', '#fff')
  };
  // Fixed-width heading on the left of each card.
  const titleStyle = {
    fontSize: '13px',
    fontWeight: '600',
    minWidth: '140px',
    flexShrink: 0,
    color: themed('#e5e7eb', 'inherit')
  };
  // Wrapping row that holds the selectable items.
  const itemsStyle = {
    display: 'flex',
    rowGap: '2px',
    columnGap: '6px',
    flexWrap: 'wrap',
    alignItems: 'center',
    flex: 1
  };
  // Base look of a selectable item (a <label> wrapping a hidden input).
  const labelBaseStyle = {
    padding: '4px 10px',
    border: `1px solid ${themed('#9ca3af', '#d1d5db')}`,
    borderRadius: '3px',
    cursor: 'pointer',
    display: 'inline-flex',
    flexDirection: 'column',
    alignItems: 'center',
    justifyContent: 'center',
    fontWeight: '500',
    fontSize: '13px',
    transition: 'all 0.2s',
    userSelect: 'none',
    minWidth: '45px',
    textAlign: 'center',
    flex: 1,
    background: themed('#374151', '#fff'),
    color: themed('#e5e7eb', 'inherit')
  };
  // Spread over labelBaseStyle for the selected item.
  const checkedStyle = {
    background: accentColor,
    color: 'white',
    borderColor: accentColor
  };
  // Spread over labelBaseStyle for unavailable items.
  const disabledStyle = {
    cursor: 'not-allowed',
    opacity: 0.5
  };
  // Small caption rendered under an item's label.
  const subtitleStyle = {
    display: 'block',
    fontSize: '9px',
    marginTop: '1px',
    lineHeight: '1.1',
    opacity: 0.7
  };
  // Free-text input used by options of type 'text'.
  const textInputStyle = {
    flex: 1,
    padding: '8px 10px',
    borderRadius: '4px',
    border: `1px solid ${themed('#4b5563', '#d1d5db')}`,
    background: themed('#111827', '#fff'),
    color: themed('#e5e7eb', '#111827'),
    fontSize: '13px'
  };
  // Monospace <pre> that shows the generated command; wraps long lines.
  const commandDisplayStyle = {
    flex: 1,
    padding: '12px 16px',
    background: themed('#111827', '#f5f5f5'),
    borderRadius: '6px',
    fontFamily: "'Menlo', 'Monaco', 'Courier New', monospace",
    fontSize: '12px',
    lineHeight: '1.5',
    color: themed('#e5e7eb', '#374151'),
    whiteSpace: 'pre-wrap',
    overflowX: 'auto',
    margin: 0,
    border: `1px solid ${themed('#374151', '#e5e7eb')}`
  };
  // Render one card per option (in declaration order), followed by the command card.
  return <div style={containerStyle} className="not-prose">
      {Object.entries(options).map(([key, option]) => {
    // An option that defines a `condition` predicate is hidden while it returns false.
    if (option.condition && !option.condition(values)) {
      return null;
    }
    // Dynamic options recompute their items from the current selection on every render.
    const items = option.getDynamicItems ? option.getDynamicItems(values) : option.items || [];
    return <div key={key} style={cardStyle}>
            <div style={titleStyle}>{option.title}</div>
            <div style={itemsStyle}>
              {option.type === 'text' ? <input type="text" value={values[option.name] || ''} placeholder={option.placeholder || ''} onChange={event => handleTextChange(option.name, event.target.value)} style={textInputStyle} /> : option.type === 'checkbox' ? (option.items || []).map(item => {
      // Checkbox branch: the stored value is an array of selected item ids.
      const isChecked = (values[option.name] || []).includes(item.id);
      // Items flagged `required`, or whose `disabledWhen(values)` is truthy, cannot be toggled.
      const isDisabled = item.required || typeof item.disabledWhen === 'function' && item.disabledWhen(values);
      return <label key={item.id} title={item.disabledReason || ''} style={{
        ...labelBaseStyle,
        ...isChecked ? checkedStyle : {},
        ...isDisabled ? disabledStyle : {}
      }}>
                      <input type="checkbox" checked={isChecked} disabled={isDisabled} onChange={event => handleCheckboxChange(option.name, item.id, event.target.checked)} style={{
        display: 'none'
      }} />
                      {item.label}
                      {item.subtitle && <small style={{
        ...subtitleStyle,
        color: isChecked ? 'rgba(255,255,255,0.85)' : 'inherit'
      }}>
                          {item.subtitle}
                        </small>}
                    </label>;
    }) : items.map(item => {
      // Radio branch (default): the stored value is a single item id.
      const isChecked = values[option.name] === item.id;
      // `disabled` is the flag carried on the item itself (set by getDynamicItems in this file).
      const isDisabled = Boolean(item.disabled);
      return <label key={item.id} title={item.disabledReason || ''} style={{
        ...labelBaseStyle,
        ...isChecked ? checkedStyle : {},
        ...isDisabled ? disabledStyle : {}
      }}>
                      <input type="radio" name={option.name} value={item.id} checked={isChecked} disabled={isDisabled} onChange={() => !isDisabled && handleRadioChange(option.name, item.id)} style={{
        display: 'none'
      }} />
                      {item.label}
                      {item.subtitle && <small style={{
        ...subtitleStyle,
        color: isChecked ? 'rgba(255,255,255,0.85)' : 'inherit'
      }}>
                          {item.subtitle}
                        </small>}
                    </label>;
    })}
            </div>
          </div>;
  })}
      <div style={cardStyle}>
        <div style={titleStyle}>Run this Command:</div>
        <pre style={commandDisplayStyle}>{command}</pre>
      </div>
    </div>;
};

## 1. Model Introduction

`NVIDIA Nemotron3-Ultra` is an open frontier reasoning model in the Nemotron 3 family, built for long-running autonomous agents. It is optimized for complex orchestration across coding, deep research, enterprise workflows, and EDA use cases where agents must sustain reasoning across many steps and large context windows.

Nemotron 3 Ultra is a 550B parameter hybrid MoE model that activates only 55B parameters per forward pass, delivering frontier reasoning accuracy with high-throughput inference. It supports a 1M token context window so agents can keep conversation history, tool outputs, and plan state in view across persistent workflows.

Architecture and key features:

* **Hybrid Transformer-Mamba Architecture (MoE):** Combines Mixture of Experts with a hybrid Transformer-Mamba architecture, enabling efficient routing and sequence modeling in a single stack.
* **Long-horizon agentic reasoning:** Tuned for agents that plan, call tools, inspect results, recover from failures, and continue working across long task horizons — coding, deep research, enterprise automation, and EDA.
* **1M token context window:** Sustains coherent agent state across extended workflows without re-ingestion.
* **BF16 and NVFP4 quantization:** Deployable from multi-node H100 down to a single Blackwell node with NVFP4.
* **Multi-environment RL post-training:** Post-trained with reinforcement learning across multiple environments for robust reasoning and reliable agentic behavior.
* **Open weights, open data, open recipes:** Customizable for domain-specific agents and deployable across your own infrastructure.

**Modalities:** Input: text — Output: text

**Supported GPUs:**

* **BF16:** 16×H100, 16×H200, 8×B200/B300
* **NVFP4:** 4/8×B200/B300, 4×GB200/GB300

Available model variants on HuggingFace:

* [`nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-BF16`](https://huggingface.co/nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-BF16)
* [`nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-NVFP4`](https://huggingface.co/nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-NVFP4)

## 2. SGLang Installation

Nemotron3-Ultra support has not yet propagated to `lmsysorg/sglang:latest` or any stable release. Pull one of the two dedicated images below — matching your CUDA version — to get a runtime with Nemotron3-Ultra support.

```bash Command theme={null}
# CUDA 13
docker pull lmsysorg/sglang:dev-nemotron3-ultra

# CUDA 12
docker pull lmsysorg/sglang:dev-cu12-nemotron3-ultra
```

## 3. Model Deployment

This section provides a progressive guide from quick deployment to performance tuning.

### 3.1 Basic Configuration

**Interactive Command Generator**: select model precision, hardware, tensor parallelism, and common knobs to generate a launch command.

The generator only emits a runnable command for combinations that NVIDIA / SGLang have validated. Selecting an unverified tuple (e.g. NVFP4 on H100/H200, BF16 with TP=4 on H100, …) is **blocked** — the command pane shows an explicit error and the verified support matrix instead of a launch line, so unvalidated commands can't be copied by accident.

<Nemotron3UltraDeployment />

### 3.2 Configuration Tips

* **Attention backend**:

  * **H100/H200**: FlashAttention 3 is the default attention backend, so no extra flag is needed.
  * **B200/GB200/B300/GB300**: Append `--attention-backend trtllm_mha`. The flashinfer default breaks the overlap scheduler on Blackwell, so `trtllm_mha` is required there.

* **Mamba scheduler strategy**:

  Always launch with `--mamba-scheduler-strategy extra_buffer`. This hybrid Transformer-Mamba model requires the `extra_buffer` strategy for correct scheduling of its Mamba state.

* **TP support**:

  To set tp size, use `--tp <4|8|16>`. Recommended pairings:

  * BF16: `--tp 16` on H100/H200, `--tp 8` on B200/B300
  * NVFP4: `--tp 4` or `--tp 8` on B200/B300, `--tp 4` on GB200/GB300

* **Multi-node BF16 on H100/H200**:

  The 16×H100 and 16×H200 BF16 setups each span two nodes. Add `--dist-init-addr <head-node-ip>:5000 --nnodes 2 --node-rank <0|1>` on each node (one rank per node) and keep `--tp 16`.

* **DP attention**:

  By default the attention layers are tensor-parallel (sharded across all TP ranks). Enabling DP attention (the toggle above, or `--dp <N> --enable-dp-attention`) instead runs attention as `N` data-parallel groups: each DP rank serves its own slice of the requests with its own KV cache. `--dp` must divide `--tp`.

* **Expert parallel (EP)**:

  This MoE only supports `ep_size == 1` (off) or `ep_size == tp_size`. To enable expert parallelism, append `--ep <tp>` with the same value as `--tp`.

* **Multi-token prediction (MTP)**:

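  Enable MTP speculative decoding (the toggle above) for lower latency. The generator then appends `--speculative-algorithm EAGLE --speculative-num-steps 3 --speculative-eagle-topk 1 --speculative-num-draft-tokens 4`.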

* **FP8 KV cache**:

  To enable fp8 kv cache, please append `--kv-cache-dtype fp8_e4m3`.

* **Reasoning parser**:

  Append `--reasoning-parser nemotron_3` to enable structured reasoning traces (`reasoning_content` field in the response).

* **Tool calling**:

  Append `--tool-call-parser qwen3_coder` to enable tool calling support.

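## 4. Model Invocation

Launch the server before running the client examples in this section. The command below serves the BF16 checkpoint with `--tp 8` (the B200/B300 pairing); the examples that follow assume the server is reachable at `http://localhost:30000`.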

```shell Command theme={null}
python3 -m sglang.launch_server \
  --model-path nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-BF16 \
  --trust-remote-code \
  --tp 8 \
  --mamba-scheduler-strategy extra_buffer \
  --attention-backend trtllm_mha \
  --tool-call-parser qwen3_coder \
  --reasoning-parser nemotron_3
```

### 4.1 Basic Usage (OpenAI-Compatible API)

SGLang provides an OpenAI-compatible endpoint. Example with the OpenAI Python client:

```python Example theme={null}
from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:30000/v1",
    api_key="EMPTY",
)

resp = client.chat.completions.create(
    model="nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-BF16",
    messages=[
        {"role": "system", "content": "You are a helpful AI assistant."},
        {"role": "user", "content": "Give me 3 bullet points about SGLang."},
    ],
    temperature=0.6,
    max_tokens=1024,
)
print("Reasoning:", resp.choices[0].message.reasoning_content, "\nContent:", resp.choices[0].message.content)
print("\n")
```

Output:

```text Output theme={null}
Reasoning: The user wants 3 bullet points about SGLang. Let me recall what I know about SGLang — it's a high-performance serving framework for large language models with a focus on structured generation and efficient KV cache reuse...(more tokens)

Content: - **Radix Attention** — SGLang reuses KV cache across requests sharing a common prefix, dramatically reducing memory and compute for multi-turn agent loops and few-shot workloads.
- **OpenAI-compatible API and structured generation** — Drop-in replacement for the OpenAI client, with first-class support for constrained decoding (JSON schema, regex) and OpenAI-style tool calling.
- **High-throughput serving on NVIDIA GPUs** — Continuous batching, chunked prefill, FP8/NVFP4 quantization, and optimized CUDA kernels deliver state-of-the-art throughput across H100, H200, B200, and GB200.
```

Streaming chat completion:

```python Example theme={null}
from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:30000/v1",
    api_key="EMPTY",
)

stream = client.chat.completions.create(
    model="nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-BF16",
    messages=[
        {"role": "system", "content": "You are a helpful AI assistant."},
        {"role": "user", "content": "What are the first 5 prime numbers?"}
    ],
    temperature=0.7,
    max_tokens=1024,
    stream=True,
)
for chunk in stream:
    delta = chunk.choices[0].delta
    if delta and delta.content:
        print(delta.content, end="", flush=True)
```

Output:

```text Output theme={null}
The first 5 prime numbers are:
**2, 3, 5, 7, 11**.

### Explanation:
- A **prime number** is a natural number greater than 1 whose only positive divisors are 1 and itself.
- **2** is the smallest prime and the only even prime.
- **3, 5, 7, 11** are each divisible only by 1 and themselves.
- **1** is not prime by definition (it has only one positive divisor).
- **4, 6, 8, 9, 10** are composite.
```

### 4.2 Reasoning

The model supports two modes: reasoning ON (the default) and reasoning OFF. To turn reasoning off for a request, set `enable_thinking` to `False` in `chat_template_kwargs`, as shown below.

```python Example theme={null}
from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:30000/v1",
    api_key="EMPTY",
)

# Reasoning on (default)
print("Reasoning on")
resp = client.chat.completions.create(
    model="nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-BF16",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Plan a 3-step approach to debug a flaky integration test. Keep the thinking process short."}
    ],
    temperature=1,
    max_tokens=1024,
)
print(f"Reasoning: \n{resp.choices[0].message.reasoning_content[:200]}... \nContent: \n{resp.choices[0].message.content[:200]}...")
print("\n")
# Reasoning off
print("Reasoning off")
resp = client.chat.completions.create(
    model="nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-BF16",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Give me 3 facts about SGLang."}
    ],
    temperature=0,
    max_tokens=256,
    extra_body={"chat_template_kwargs": {"enable_thinking": False}}
)
print(f"Content: \n{resp.choices[0].message.content[:200]}...")
```

Output:

```text Output theme={null}
Reasoning on
Reasoning:
The user wants a short reasoning chain plus a 3-step debug plan for a flaky integration test. I'll think briefly about common causes (timing/race, shared state, external service variance) and pick a t...
Content:
1. **Reproduce deterministically** — run the test in a loop (e.g. 50–100x) with logging at the suspected race points to confirm the failure rate and surface ordering.
2. **Isolate state** — re-run with...

Reasoning off
Content:
Here are 3 facts about SGLang:

1. **High-performance LLM serving system** developed at UC Berkeley with contributions from a broad open-source community, focused on throughput and latency at scale.
...
```

### 4.3 Tool Calling

Call functions using the OpenAI Tools schema and inspect returned `tool_calls`. The server must be launched with `--tool-call-parser qwen3_coder`.

```python Example theme={null}
from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:30000/v1",
    api_key="EMPTY",
)

# Tool calling via OpenAI tools schema
TOOLS = [
    {
        "type": "function",
        "function": {
            "name": "search_codebase",
            "description": "Search the project codebase for a symbol or pattern.",
            "parameters": {
                "type": "object",
                "properties": {
                    "query": {
                        "type": "string",
                        "description": "The symbol, function name, or regex to search for"
                    },
                    "path": {
                        "type": "string",
                        "description": "Optional sub-path to restrict the search to"
                    }
                },
                "required": ["query"]
            }
        }
    }
]

completion = client.chat.completions.create(
    model="nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-BF16",
    messages=[
        {"role": "system", "content": "You are a coding agent. Use tools to inspect the repo before answering."},
        {"role": "user", "content": "Where is the `RadixCache` class defined?"}
    ],
    tools=TOOLS,
    temperature=0.6,
    top_p=0.95,
    max_tokens=512,
    stream=False
)

print(completion.choices[0].message.reasoning_content)
print(completion.choices[0].message.tool_calls)
```

Output:

```text Output theme={null}
The user is asking where the RadixCache class is defined. I should search the codebase for the symbol "RadixCache" to find the file and line. I'll call search_codebase with that query.

[ChatCompletionMessageFunctionToolCall(id='call_8a7f2c4e1b9d4a3e8c2f1d6b', function=Function(arguments='{"query": "class RadixCache"}', name='search_codebase'), type='function', index=0)]
```

### 4.4 Controlling Reasoning Budget

The `reasoning_budget` parameter allows you to limit the length of the model's reasoning trace. When the reasoning output reaches the specified token budget, the model will attempt to gracefully end the reasoning at the next newline character.

If no newline is encountered within 500 tokens after reaching the budget threshold, the reasoning trace will be forcibly terminated at `reasoning_budget + 500` tokens.

```python Example theme={null}
from typing import Any, Dict, List
import openai
from transformers import AutoTokenizer

class ThinkingBudgetClient:
    """Client-side helper that caps the number of tokens spent on reasoning.

    The budget is enforced with two requests against an OpenAI-compatible
    endpoint:

    1. A chat completion limited to ``reasoning_budget`` tokens, from which
       only ``reasoning_content`` is kept.
    2. A raw text completion whose prompt is the chat template rendered with
       the (closed) reasoning pre-filled as the assistant turn, so the model
       continues straight into the final answer.
    """

    # Marker that terminates the reasoning section in the rendered prompt.
    _THINK_END = "</think>"

    def __init__(self, base_url: str, api_key: str, tokenizer_name_or_path: str):
        """Create the OpenAI client and load the tokenizer used for templating.

        Args:
            base_url: Base URL of the OpenAI-compatible server.
            api_key: API key passed to the OpenAI client.
            tokenizer_name_or_path: Name or path given to
                ``AutoTokenizer.from_pretrained``.
        """
        self.base_url = base_url
        self.api_key = api_key
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path)
        self.client = openai.OpenAI(base_url=self.base_url, api_key=self.api_key)

    def chat_completion(
        self,
        model: str,
        messages: List[Dict[str, Any]],
        reasoning_budget: int = 512,
        max_tokens: int = 1024,
        **kwargs,
    ) -> Dict[str, Any]:
        """Run a chat completion whose reasoning is limited to ``reasoning_budget`` tokens.

        Args:
            model: Model name sent with both requests.
            messages: Chat messages. The list is not modified.
            reasoning_budget: ``max_tokens`` used for the reasoning request.
            max_tokens: Overall token allowance; what is left after the
                reasoning is given to the answer request.
            **kwargs: Extra sampling options forwarded to both requests.

        Returns:
            A dict with ``reasoning_content`` (reasoning text without the
            closing ``</think>`` marker), ``content`` (the answer text) and
            ``finish_reason`` (of the answer request).

        Raises:
            AssertionError: If ``max_tokens`` is not larger than
                ``reasoning_budget``, or if no tokens remain for the answer.
        """
        assert (
            max_tokens > reasoning_budget
        ), f"reasoning_budget must be smaller than max_tokens. Given {max_tokens=} and {reasoning_budget=}"

        # 1. first call chat completion to get reasoning content
        response = self.client.chat.completions.create(
            model=model,
            messages=messages,
            max_tokens=reasoning_budget,
            **kwargs
        )

        reasoning_content = response.choices[0].message.reasoning_content or ""

        if self._THINK_END not in reasoning_content:
            # reasoning content is too long, closed with a period (.)
            reasoning_content = f"{reasoning_content}.\n{self._THINK_END}\n\n"

        reasoning_tokens_used = len(
            self.tokenizer.encode(reasoning_content, add_special_tokens=False)
        )
        remaining_tokens = max_tokens - reasoning_tokens_used

        assert (
            remaining_tokens > 0
        ), f"remaining tokens must be positive. Given {remaining_tokens=}. Increase max_tokens or lower reasoning_budget."

        # 2. pre-fill the reasoning as the assistant turn and call completion.
        # Build a new list instead of appending so the caller's `messages`
        # stays reusable for further requests.
        prefilled_messages = [
            *messages,
            {"role": "assistant", "content": reasoning_content},
        ]
        prompt = self.tokenizer.apply_chat_template(
            prefilled_messages,
            tokenize=False,
            continue_final_message=True,
        )

        response = self.client.completions.create(
            model=model,
            prompt=prompt,
            max_tokens=remaining_tokens,
            **kwargs
        )

        # Remove the closing marker as a whole suffix. `str.strip("</think>")`
        # would treat the argument as a character set and also eat leading or
        # trailing letters such as "t", "h", "i", "n", "k" from the reasoning.
        visible_reasoning = reasoning_content.strip()
        if visible_reasoning.endswith(self._THINK_END):
            visible_reasoning = visible_reasoning[: -len(self._THINK_END)]

        response_data = {
            "reasoning_content": visible_reasoning.strip(),
            "content": response.choices[0].text,
            "finish_reason": response.choices[0].finish_reason,
        }
        return response_data
```

Usage example with `reasoning_budget=256`:

```python Example theme={null}
SERVED_MODEL_NAME = "nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-BF16"

# Client: the served model name doubles as the tokenizer identifier.
client = ThinkingBudgetClient(
    base_url="http://127.0.0.1:30000/v1",
    api_key="null",
    tokenizer_name_or_path=SERVED_MODEL_NAME,
)

# Prompt for the budgeted request.
conversation = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Outline a research plan to evaluate the throughput of two MoE serving strategies."},
]

# Allow 256 tokens of reasoning out of an overall 1024-token allowance.
resp = client.chat_completion(
    model=SERVED_MODEL_NAME,
    messages=conversation,
    temperature=1,
    max_tokens=1024,
    reasoning_budget=256,
)

reasoning_text = resp["reasoning_content"]
answer_text = resp["content"]
print("Reasoning:", reasoning_text, "\nContent:", answer_text)
```

Output:

```text Output theme={null}
Reasoning: The user wants a research plan to compare throughput of two MoE serving strategies. I should outline goals, baselines, datasets, metrics (tokens/s, TTFT, ITL, MFU), variables to sweep (TP, batch size, sequence length, concurrency), and statistical handling. Keep it concise since reasoning_budget is 256...
Content:

**Research plan**

1. **Define goal & metrics** — peak token throughput (input+output), TTFT, P99 ITL, MFU; measured at fixed accuracy.
2. **Choose baselines** — Strategy A (TP-only) vs Strategy B (TP + expert-parallel). Hold model checkpoint, precision, and KV-cache dtype constant.
3. **Sweep** — `{batch ∈ 1,4,16,64, concurrency ∈ 16,64,256, seq_len ∈ 1k,8k,32k}` per strategy.
4. **Workload** — `sglang.bench_serving --dataset-name random` with matched input/output budgets.
5. **Analysis** — per-config throughput table + roofline overlay; bootstrap CIs over 3 reruns to bound noise.
```

***

## 5. Benchmark

### 5.1 Speed Benchmark

**Test Environment:**

* Hardware: GB200 (4x)

* Model: nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-NVFP4

* Tensor Parallelism: 4

* SGLang Version: main branch

* Model Deployment Command:

```shell Command theme={null}
# Launch the SGLang server with the NVFP4 checkpoint and tensor parallelism 4 for the speed benchmark.
python3 -m sglang.launch_server \
  --model-path nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-NVFP4 \
  --trust-remote-code \
  --tp 4 \
  --mamba-scheduler-strategy extra_buffer \
  --attention-backend trtllm_mha \
  --max-running-requests 1024
```

* Benchmark Command:

```shell Command theme={null}
# Benchmark with the `random` dataset: 4096 prompts, max concurrency 256, input/output length settings of 1024.
python3 -m sglang.bench_serving \
  --backend sglang \
  --model nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-NVFP4 \
  --dataset-name random \
  --random-input-len 1024 \
  --random-output-len 1024 \
  --num-prompts 4096 \
  --max-concurrency 256
```

* **Test Results:**

```text Output theme={null}
============ Serving Benchmark Result ============
Backend:                                 sglang
Traffic request rate:                    inf
Max request concurrency:                 256
Successful requests:                     4096
Benchmark duration (s):                  1184.58
Total input tokens:                      2081726
Total input text tokens:                 2081726
Total generated tokens:                  2087288
Total generated tokens (retokenized):    1990224
Request throughput (req/s):              3.46
Input token throughput (tok/s):          1757.35
Output token throughput (tok/s):         1762.05
Peak output token throughput (tok/s):    3150.00
Peak concurrent requests:                266
Total token throughput (tok/s):          3519.40
Concurrency:                             249.55
----------------End-to-End Latency----------------
Mean E2E Latency (ms):                   72169.95
Median E2E Latency (ms):                 71994.47
P90 E2E Latency (ms):                    99898.56
P99 E2E Latency (ms):                    107119.61
---------------Time to First Token----------------
Mean TTFT (ms):                          40057.33
Median TTFT (ms):                        41375.93
P99 TTFT (ms):                           46377.89
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms):                          63.15
Median TPOT (ms):                        63.65
P99 TPOT (ms):                           78.16
---------------Inter-Token Latency----------------
Mean ITL (ms):                           63.14
Median ITL (ms):                         35.92
P95 ITL (ms):                            178.10
P99 ITL (ms):                            182.10
Max ITL (ms):                            2466.36
==================================================
```

### 5.2 Accuracy Benchmark

#### 5.2.1 GSM8K Benchmark

**Environment**

* Hardware: GB200 (4x)
* Model: nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-NVFP4
* Tensor Parallelism: 4
* SGLang Version: main branch

**Launch Model**

```bash Command theme={null}
# Launch the NVFP4 checkpoint with tensor parallelism 4 and the nemotron_3 reasoning parser for the accuracy run.
python3 -m sglang.launch_server \
  --model-path nvidia/NVIDIA-Nemotron-3-Ultra-550B-A55B-NVFP4 \
  --trust-remote-code \
  --tp 4 \
  --mamba-scheduler-strategy extra_buffer \
  --attention-backend trtllm_mha \
  --reasoning-parser nemotron_3
```

**Run Benchmark**

```bash Command theme={null}
python3 benchmark/gsm8k/bench_sglang.py
```

**Test Results:**

```text Output theme={null}
Accuracy: 0.970
Invalid: 0.000
Latency: 29.129 s
Output throughput: 745.333 token/s
```

#### 5.2.2 MMLU Benchmark

**Run Benchmark**

```bash Command theme={null}
python3 benchmark/mmlu/bench_sglang.py
```

**Test Results:**

```text Output theme={null}
TBD
```

***
