> ## Documentation Index
> Fetch the complete documentation index at: https://docs.sglang.io/llms.txt
> Use this file to discover all available pages before exploring further.

# GPT-OSS

export const GPTOSSDeployment = () => {
  const options = {
    hardware: {
      name: 'hardware',
      title: 'Hardware Platform',
      items: [{
        id: 'b200',
        label: 'B200',
        default: true
      }, {
        id: 'b300',
        label: 'B300',
        default: false
      }, {
        id: 'h200',
        label: 'H200',
        default: false
      }, {
        id: 'h100',
        label: 'H100',
        default: false
      }, {
        id: 'mi300x',
        label: 'MI300X',
        default: false
      }, {
        id: 'mi325x',
        label: 'MI325X',
        default: false
      }, {
        id: 'mi355x',
        label: 'MI355X',
        default: false
      }, {
        id: 'xeon',
        label: 'XEON',
        default: false
      }]
    },
    modelsize: {
      name: 'modelsize',
      title: 'Model Size',
      items: [{
        id: '120b',
        label: '120B',
        subtitle: 'MOE',
        default: true
      }, {
        id: '20b',
        label: '20B',
        subtitle: 'MOE',
        default: false
      }]
    },
    quantization: {
      name: 'quantization',
      title: 'Quantization',
      items: [{
        id: 'mxfp4',
        label: 'MXFP4',
        default: true
      }, {
        id: 'bf16',
        label: 'BF16',
        default: false
      }]
    },
    reasoningParser: {
      name: 'reasoningParser',
      title: 'Reasoning Parser',
      items: [{
        id: 'disabled',
        label: 'Disabled',
        default: true
      }, {
        id: 'enabled',
        label: 'Enabled',
        default: false
      }]
    },
    toolcall: {
      name: 'toolcall',
      title: 'Tool Call Parser',
      items: [{
        id: 'disabled',
        label: 'Disabled',
        default: true
      }, {
        id: 'enabled',
        label: 'Enabled',
        default: false
      }]
    },
    speculative: {
      name: 'speculative',
      title: 'Speculative Decoding',
      items: [{
        id: 'disabled',
        label: 'Disabled',
        default: true
      }, {
        id: 'enabled',
        label: 'Enabled',
        default: false
      }]
    }
  };
  const getDisplayOptions = values => ({
    ...options,
    quantization: options.quantization,
    speculative: {
      ...options.speculative,
      items: options.speculative.items.map(item => ({
        ...item,
        disabled: values.hardware === 'xeon' && item.id === 'enabled'
      }))
    }
  });
  const getInitialState = () => {
    const initialState = {};
    Object.entries(options).forEach(([key, option]) => {
      if (option.type === 'checkbox') {
        initialState[key] = option.items.filter(item => item.default).map(item => item.id);
      } else {
        const defaultItem = option.items.find(item => item.default);
        initialState[key] = defaultItem ? defaultItem.id : option.items[0].id;
      }
    });
    return initialState;
  };
  const [values, setValues] = useState(getInitialState);
  const [isDark, setIsDark] = useState(false);
  useEffect(() => {
    const checkDarkMode = () => {
      const html = document.documentElement;
      const isDarkMode = html.classList.contains('dark') || html.getAttribute('data-theme') === 'dark' || html.style.colorScheme === 'dark';
      setIsDark(isDarkMode);
    };
    checkDarkMode();
    const observer = new MutationObserver(checkDarkMode);
    observer.observe(document.documentElement, {
      attributes: true,
      attributeFilter: ['class', 'data-theme', 'style']
    });
    return () => observer.disconnect();
  }, []);
  const handleRadioChange = (optionName, value) => {
    setValues(prev => {
      const next = {
        ...prev,
        [optionName]: value
      };
      if (optionName === 'hardware' && value === 'xeon') {
        next.speculative = 'disabled';
      }
      return next;
    });
  };
  const generateCommand = () => {
    const {hardware, modelsize, quantization, reasoningParser, toolcall, speculative} = values;
    const modelConfigs = {
      '120b': {
        baseName: '120b',
        h100: {
          tp: 8
        },
        h200: {
          tp: 8
        },
        b200: {
          tp: 8
        },
        b300: {
          tp: 8
        },
        mi300x: {
          tp: 8
        },
        mi325x: {
          tp: 8
        },
        mi355x: {
          tp: 8
        },
        xeon: {
          tp: 3
        }
      },
      '20b': {
        baseName: '20b',
        h100: {
          tp: 1
        },
        h200: {
          tp: 1
        },
        b200: {
          tp: 1
        },
        b300: {
          tp: 1
        },
        mi300x: {
          tp: 1
        },
        mi325x: {
          tp: 1
        },
        mi355x: {
          tp: 1
        },
        xeon: {
          tp: 3
        }
      }
    };
    const config = modelConfigs[modelsize];
    if (!config) {
      return `# Error: Unknown model size: ${modelsize}`;
    }
    const hwConfig = config[hardware];
    if (!hwConfig) {
      return `# Error: Unknown hardware platform: ${hardware}`;
    }
    const quantSuffix = quantization === 'bf16' ? '-bf16' : '';
    const orgPrefix = quantization === 'bf16' ? 'lmsys' : 'openai';
    const modelName = `${orgPrefix}/gpt-oss-${config.baseName}${quantSuffix}`;
    let cmd = '';
    if ((hardware === 'mi300x' || hardware === 'mi325x' || hardware === 'mi355x') && speculative === 'enabled') {
      return '# MI30x GPUs Speculative Decoding: Work In Progress';
    }
    if ((hardware === 'mi300x' || hardware === 'mi325x') && quantization === 'mxfp4') {
      return '# MI300X/MI325X GPUs with MXFP4 quantization: Work In Progress';
    }
    if (hardware === 'mi300x' || hardware === 'mi325x' || hardware === 'mi355x') {
      cmd += 'SGLANG_USE_AITER=0 ';
    }
    if (speculative === 'enabled') {
      cmd += 'SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN=1 ';
    }
    cmd += 'python -m sglang.launch_server \\\n';
    cmd += `  --model ${modelName}`;
    if (hardware === 'xeon') {
      cmd += ` \\
  --device cpu \\
  --disable-overlap-schedule`;
    }
    if (hwConfig.tp > 1) {
      cmd += ` \\\n  --tp ${hwConfig.tp}`;
    }
    if (reasoningParser === 'enabled') {
      cmd += ` \\\n  --reasoning-parser gpt-oss`;
    }
    if (toolcall === 'enabled') {
      cmd += ` \\\n  --tool-call-parser gpt-oss`;
    }
    if (hardware === 'b300') {
      cmd += ` \\\n  --attention-backend triton`;
      cmd += ` \\\n  --moe-runner-backend triton`;
      cmd += ` \\\n  --enforce-disable-flashinfer-allreduce-fusion`;
    }
    if (speculative === 'enabled') {
      cmd += ` \\\n  --speculative-algorithm EAGLE3 \\\n  --speculative-num-steps 3 \\\n  --speculative-eagle-topk 1 \\\n  --speculative-num-draft-tokens 4`;
      if (modelsize === '120b') {
        cmd += ` \\\n  --speculative-draft-model-path nvidia/gpt-oss-120b-Eagle3`;
      } else if (modelsize === '20b') {
        cmd += ` \\\n  --speculative-draft-model-path zhuyksir/EAGLE3-gpt-oss-20b-bf16`;
      }
    }
    return cmd;
  };
  const containerStyle = {
    maxWidth: '900px',
    margin: '0 auto',
    display: 'flex',
    flexDirection: 'column',
    gap: '4px'
  };
  const cardStyle = {
    padding: '8px 12px',
    border: `1px solid ${isDark ? '#374151' : '#e5e7eb'}`,
    borderLeft: `3px solid ${isDark ? '#E85D4D' : '#D45D44'}`,
    borderRadius: '4px',
    display: 'flex',
    alignItems: 'center',
    gap: '12px',
    background: isDark ? '#1f2937' : '#fff'
  };
  const titleStyle = {
    fontSize: '13px',
    fontWeight: '600',
    minWidth: '140px',
    flexShrink: 0,
    color: isDark ? '#e5e7eb' : 'inherit'
  };
  const itemsStyle = {
    display: 'flex',
    rowGap: '2px',
    columnGap: '6px',
    flexWrap: 'wrap',
    alignItems: 'center',
    flex: 1
  };
  const labelBaseStyle = {
    padding: '4px 10px',
    border: `1px solid ${isDark ? '#9ca3af' : '#d1d5db'}`,
    borderRadius: '3px',
    cursor: 'pointer',
    display: 'inline-flex',
    flexDirection: 'column',
    alignItems: 'center',
    justifyContent: 'center',
    fontWeight: '500',
    fontSize: '13px',
    transition: 'all 0.2s',
    userSelect: 'none',
    minWidth: '45px',
    textAlign: 'center',
    flex: 1,
    background: isDark ? '#374151' : '#fff',
    color: isDark ? '#e5e7eb' : 'inherit'
  };
  const checkedStyle = {
    background: '#D45D44',
    color: 'white',
    borderColor: '#D45D44'
  };
  const disabledStyle = {
    cursor: 'not-allowed',
    opacity: 0.5
  };
  const subtitleStyle = {
    display: 'block',
    fontSize: '9px',
    marginTop: '1px',
    lineHeight: '1.1',
    opacity: 0.7
  };
  const commandDisplayStyle = {
    flex: 1,
    padding: '12px 16px',
    background: isDark ? '#111827' : '#f5f5f5',
    borderRadius: '6px',
    fontFamily: "'Menlo', 'Monaco', 'Courier New', monospace",
    fontSize: '12px',
    lineHeight: '1.5',
    color: isDark ? '#e5e7eb' : '#374151',
    whiteSpace: 'pre-wrap',
    overflowX: 'auto',
    margin: 0,
    border: `1px solid ${isDark ? '#374151' : '#e5e7eb'}`
  };
  return <div style={containerStyle} className="not-prose">
      {Object.entries(getDisplayOptions(values)).map(([key, option]) => <div key={key} style={cardStyle}>
          <div style={titleStyle}>{option.title}</div>
          <div style={itemsStyle}>
            {option.type === 'checkbox' ? option.items.map(item => {
    const isChecked = (values[option.name] || []).includes(item.id);
    const isDisabled = item.required;
    return <label key={item.id} style={{
      ...labelBaseStyle,
      ...isChecked ? checkedStyle : {},
      ...isDisabled ? disabledStyle : {}
    }}>
                    <input type="checkbox" checked={isChecked} disabled={isDisabled} onChange={e => handleCheckboxChange(option.name, item.id, e.target.checked)} style={{
      display: 'none'
    }} />
                    {item.label}
                    {item.subtitle && <small style={{
      ...subtitleStyle,
      color: isChecked ? 'rgba(255,255,255,0.85)' : 'inherit'
    }}>{item.subtitle}</small>}
                  </label>;
  }) : option.items.map(item => {
    const isChecked = values[option.name] === item.id;
    const isDisabled = Boolean(item.disabled);
    return <label key={item.id} style={{
      ...labelBaseStyle,
      ...isChecked ? checkedStyle : {},
      ...isDisabled ? disabledStyle : {}
    }}>
                    <input type="radio" name={option.name} value={item.id} checked={isChecked} disabled={isDisabled} onChange={() => !isDisabled && handleRadioChange(option.name, item.id)} style={{
      display: 'none'
    }} />
                    {item.label}
                    {item.subtitle && <small style={{
      ...subtitleStyle,
      color: isChecked ? 'rgba(255,255,255,0.85)' : 'inherit'
    }}>{item.subtitle}</small>}
                  </label>;
  })}
          </div>
        </div>)}
      <div style={cardStyle}>
        <div style={titleStyle}>Run this Command:</div>
        <pre style={commandDisplayStyle}>{generateCommand()}</pre>
      </div>
    </div>;
};

## 1.Model Introduction

[GPT-OSS](https://huggingface.co/openai/gpt-oss-20b) is an advanced large language model developed by OpenAI, designed for powerful reasoning, agentic tasks, and versatile developer use cases. It is available in two model sizes.

* **gpt-oss-120b** — for production, general purpose, high reasoning use cases that fit into a single 80GB GPU (like NVIDIA H100 80GB or AMD MI300X 192GB) (117B parameters with 5.1B active parameters)
* **gpt-oss-20b** — for lower latency, and local or specialized use cases (21B parameters with 3.6B active parameters)

GPT-OSS introduces several groundbreaking innovations:

* **Configurable reasoning effort**: Easily adjust the reasoning effort (low, medium, high) based on your specific use case and latency needs.
* **Full chain-of-thought**: Gain complete access to the model’s reasoning process, facilitating easier debugging and increased trust in outputs. It’s not intended to be shown to end users.
* **Fine-tunable**: Fully customize models to your specific use case through parameter fine-tuning.
* **Agentic capabilities**: Use the models’ native capabilities for function calling, web browsing, Python code execution, and Structured Outputs.
* **MXFP4 quantization**: The models were post-trained with MXFP4 quantization of the MoE weights, making gpt-oss-120b run on a single 80GB GPU (like NVIDIA H100 80GB or AMD MI300X 192GB) and the gpt-oss-20b model run within 16GB of memory. All evals were performed with the same MXFP4 quantization.

## 2.SGLang Installation

SGLang offers multiple installation methods. You can choose the most suitable installation method based on your hardware platform and requirements.

Please refer to the [official SGLang installation guide](../../../docs/get-started/install) for installation instructions.

For SGLang CPU installation, please refer to the [CPU version installation guide](../../../docs/hardware-platforms/cpu_server#installation).

## 3.Model Deployment

This section provides deployment configurations optimized for different hardware platforms and use cases.

### 3.1 Basic Configuration

The GPT-OSS series comes in two sizes. Recommended starting configurations vary depending on hardware.

**Interactive Command Generator**: Use the configuration selector below to automatically generate the appropriate deployment command for your hardware platform, model size, quantization method, and optional features (reasoning parser, tool call parser, and speculative decoding).

<GPTOSSDeployment />

### 3.2 Configuration Tips

* **Native web search:** Set `EXA_API_KEY` in the SGLang server environment to enable built-in web search (Exa). No `--tool-server` is required, and requests are tagged with `x-exa-integration: sglang`.
* **Web search defaults:** `numResults=10`, search `type="auto"`, and `contents.highlights=true`. Override with `SGLANG_EXA_NUM_RESULTS`, `SGLANG_EXA_SEARCH_TYPE`, and `SGLANG_EXA_INCLUDE_HIGHLIGHTS`.
* **Python tool:** Add `--tool-server demo` to enable the Python interpreter. Runs in a Docker sandbox by default; set `PYTHON_EXECUTION_BACKEND=UV` to run on the host (model-generated code executes locally — use with care).
* **MCP tool servers:** For production, point SGLang at external MCP SSE servers with `--tool-server ip-1:port-1,ip-2:port-2`.
* **Responses API:** GPT-OSS supports OpenAI's Responses API (`client.responses.create`) in addition to the standard Chat Completions API (see section 4.2.4).
* **Use Python 3.12** when running the demo Python tool.
* **Xeon CPU service configuration:** Please refer to the `Notes` part in the serving engine launching section in [the SGLang CPU server document](../../../docs/hardware-platforms/cpu_server#launch-of-the-serving-engine) to better understand how to configure the arguments, especially for TP (tensor parallel) and NUMA binding settings.

## 4.Model Invocation

### 4.1 Basic Usage

For basic API usage and request examples, please refer to:

* [SGLang Basic Usage Guide](../../../docs/basic_usage/send_request)

### 4.2 Advanced Usage

#### 4.2.1 Reasoning Parser

GPT-OSS supports reasoning mode. Enable the reasoning parser during deployment to separate the thinking and content sections:

```shell Command theme={null}
python -m sglang.launch_server \
  --model openai/gpt-oss-120b \
  --reasoning-parser gpt-oss \
  --tp 8
```

```python Example theme={null}
from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:8000/v1",
    api_key="EMPTY"
)

# Enable streaming to see the thinking process in real-time
response = client.chat.completions.create(
    model="openai/gpt-oss-120b",
    messages=[
        {"role": "user", "content": "Solve this problem step by step: What is 15% of 240?"}
    ],
    temperature=0.7,
    max_tokens=2048,
    stream=True
)

# Process the stream
has_thinking = False
has_answer = False
thinking_started = False

for chunk in response:
    if chunk.choices and len(chunk.choices) > 0:
        delta = chunk.choices[0].delta

        # Print thinking process
        if hasattr(delta, 'reasoning_content') and delta.reasoning_content:
            if not thinking_started:
                print("=============== Thinking =================", flush=True)
                thinking_started = True
            has_thinking = True
            print(delta.reasoning_content, end="", flush=True)

        # Print answer content
        if delta.content:
            # Close thinking section and add content header
            if has_thinking and not has_answer:
                print("\n=============== Content =================", flush=True)
                has_answer = True
            print(delta.content, end="", flush=True)

print()
```

**Output Example:**

```text Output theme={null}
=============== Thinking =================
The user asks: "Solve this problem step by step: What is 15% of 240?" So we need to provide step-by-step solution. Compute 15% of 240: 0.15 * 240 = 36. Provide steps: convert percent to decimal, multiply, maybe use fraction. Provide answer.
=============== Content =================
**Step‑by‑step solution**

1. **Understand what “percent” means**
   “15 %” means 15 out of every 100 parts, i.e. the fraction \(\displaystyle \frac{15}{100}\).

2. **Convert the percent to a decimal (or fraction)**
   \[
   \frac{15}{100}=0.15
   \]

3. **Set up the multiplication**
   To find 15 % of 240 we multiply 240 by the decimal 0.15:
   \[
   240 \times 0.15
   \]

4. **Do the multiplication**
   One convenient way is to break it into two easier parts:
   \[
   240 \times 0.15 = 240 \times \left(\frac{15}{100}\right)
                = \frac{240 \times 15}{100}
   \]

   - First compute \(240 \times 15\):
     \[
     240 \times 15 = 240 \times (10 + 5) = 2400 + 1200 = 3600
     \]

   - Then divide by 100:
     \[
     \frac{3600}{100} = 36
     \]

5. **Write the result**
   \[
   15\% \text{ of } 240 = 36
   \]

---

**Answer:** \(36\)
```

#### 4.2.2 Tool Calling

GPT-OSS supports tool calling capabilities. Enable the tool call parser:

**Python Example (without Thinking Process):**

Start sglang server:

```shell Command theme={null}
python -m sglang.launch_server \
  --model openai/gpt-oss-120b \
  --tool-call-parser gpt-oss \
  --tp 8
```

```python Example theme={null}
from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:8000/v1",
    api_key="EMPTY"
)

# Define available tools
tools = [
    {
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Get the current weather for a location",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {
                        "type": "string",
                        "description": "The city name"
                    },
                    "unit": {
                        "type": "string",
                        "enum": ["celsius", "fahrenheit"],
                        "description": "Temperature unit"
                    }
                },
                "required": ["location"]
            }
        }
    }
]

# Make request with streaming to see thinking process
response = client.chat.completions.create(
    model="openai/gpt-oss-120b",
    messages=[
        {"role": "user", "content": "What's the weather in Beijing?"}
    ],
    tools=tools,
    temperature=0.7,
    stream=True
)

# Process streaming response
thinking_started = False
has_thinking = False

for chunk in response:
    if chunk.choices and len(chunk.choices) > 0:
        delta = chunk.choices[0].delta

        # Print thinking process
        if hasattr(delta, 'reasoning_content') and delta.reasoning_content:
            if not thinking_started:
                print("=============== Thinking =================", flush=True)
                thinking_started = True
            has_thinking = True
            print(delta.reasoning_content, end="", flush=True)

        # Print tool calls
        if hasattr(delta, 'tool_calls') and delta.tool_calls:
            # Close thinking section if needed
            if has_thinking and thinking_started:
                print("\n=============== Content =================", flush=True)
                thinking_started = False

            for tool_call in delta.tool_calls:
                if tool_call.function:
                    print(f"🔧 Tool Call: {tool_call.function.name}")
                    print(f"   Arguments: {tool_call.function.arguments}")

        # Print content
        if delta.content:
            print(delta.content, end="", flush=True)

print()
```

**Output Example:**

```text Output theme={null}
🔧 Tool Call: get_weather
   Arguments: {"location": "Beijing", "unit": "celsius"}
```

**Python Example (with Thinking Process):**

Start sglang server:

```shell Command theme={null}
python -m sglang.launch_server \
  --model openai/gpt-oss-120b \
  --reasoning-parser gpt-oss \
  --tool-call-parser gpt-oss \
  --tp 8
```

```python Example theme={null}
from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:8000/v1",
    api_key="EMPTY"
)

# Define available tools
tools = [
    {
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Get the current weather for a location",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {
                        "type": "string",
                        "description": "The city name"
                    },
                    "unit": {
                        "type": "string",
                        "enum": ["celsius", "fahrenheit"],
                        "description": "Temperature unit"
                    }
                },
                "required": ["location"]
            }
        }
    }
]

# Make request with streaming to see thinking process
response = client.chat.completions.create(
    model="openai/gpt-oss-120b",
    messages=[
        {"role": "user", "content": "What's the weather in Beijing?"}
    ],
    tools=tools,
    temperature=0.7,
    stream=True
)

# Process streaming response
thinking_started = False
has_thinking = False

for chunk in response:
    if chunk.choices and len(chunk.choices) > 0:
        delta = chunk.choices[0].delta

        # Print thinking process
        if hasattr(delta, 'reasoning_content') and delta.reasoning_content:
            if not thinking_started:
                print("=============== Thinking =================", flush=True)
                thinking_started = True
            has_thinking = True
            print(delta.reasoning_content, end="", flush=True)

        # Print tool calls
        if hasattr(delta, 'tool_calls') and delta.tool_calls:
            # Close thinking section if needed
            if has_thinking and thinking_started:
                print("\n=============== Content =================", flush=True)
                thinking_started = False

            for tool_call in delta.tool_calls:
                if tool_call.function:
                    print(f"🔧 Tool Call: {tool_call.function.name}")
                    print(f"   Arguments: {tool_call.function.arguments}")

        # Print content
        if delta.content:
            print(delta.content, end="", flush=True)

print()
```

**Output Example:**

```text Output theme={null}
=============== Thinking =================
User asks: "What's the weather in Beijing?" We need to get current weather. Use function get_weather with location "Beijing". No unit specified; default? Probably use default (maybe Celsius). We can specify unit as "celsius". We'll call function.
=============== Content =================
🔧 Tool Call: get_weather
   Arguments: {"location": "Beijing", "unit": "celsius"}
```

**Note:**

* The reasoning parser shows how the model decides to use a tool
* Tool calls are clearly marked with the function name and arguments
* You can then execute the function and send the result back to continue the conversation

**Handling Tool Call Results:**

```python Example theme={null}
# After getting the tool call, execute the function
def get_weather(location, unit="celsius"):
    # Your actual weather API call here
    return f"The weather in {location} is 22°{unit[0].upper()} and sunny."

# Send tool result back to the model
messages = [
    {"role": "user", "content": "What's the weather in Beijing?"},
    {
        "role": "assistant",
        "content": None,
        "tool_calls": [{
            "id": "call_123",
            "type": "function",
            "function": {
                "name": "get_weather",
                "arguments": '{"location": "Beijing", "unit": "celsius"}'
            }
        }]
    },
    {
        "role": "tool",
        "tool_call_id": "call_123",
        "content": get_weather("Beijing", "celsius")
    }
]

final_response = client.chat.completions.create(
    model="openai/gpt-oss-120b",
    messages=messages,
    temperature=0.7
)

print(final_response.choices[0].message.content)
# Output: "The current weather in Beijing is 22 °C and sunny. Let me know if you’d like a forecast for the next few days or any other details!"
```

#### 4.2.3 EAGLE3 Speculative Decoding

SGLang supports speculative decoding for GPT-OSS models using the EAGLE3 algorithm. This can significantly improve decoding speed, especially for small batch sizes.

```shell Command theme={null}
python3 -m sglang.launch_server \
  --model-path openai/gpt-oss-120b \
  --speculative-algorithm EAGLE3 \
  --speculative-draft-model-path lmsys/EAGLE3-gpt-oss-120b-bf16 \
  --tp 2
```

<Tip>
  The spec-v2 overlap scheduler is enabled by default. It improves performance by overlapping draft and verification stages. Pass `--disable-overlap-schedule` to disable.
</Tip>

#### 4.2.4 Responses API and Built-in Tools

GPT-OSS supports the OpenAI Responses API with built-in tool use (web search and Python interpreter). Set `EXA_API_KEY` to enable native web search; add `--tool-server demo` only when you also want the Python tool:

```shell Command theme={null}
export EXA_API_KEY=YOUR_EXA_KEY
# Optional: server-side Exa tuning (defaults shown)
export SGLANG_EXA_NUM_RESULTS=10
export SGLANG_EXA_SEARCH_TYPE=auto
export SGLANG_EXA_INCLUDE_HIGHLIGHTS=true
# Optional: run Python tool on host instead of Docker (model code executes locally)
export PYTHON_EXECUTION_BACKEND=UV

python3 -m sglang.launch_server \
  --model-path openai/gpt-oss-120b \
  --tp 2
```

For production, use external MCP SSE servers instead of `demo`:

```shell Command theme={null}
mcp run -t sse browser_server.py:mcp
mcp run -t sse python_server.py:mcp

python -m sglang.launch_server \
  --model-path openai/gpt-oss-120b \
  --tool-server ip-1:port-1,ip-2:port-2 \
  --tp 2
```

**Example using Responses API:**

```python Example theme={null}
from openai import OpenAI

client = OpenAI(base_url="http://localhost:30000/v1", api_key="sk-123456")

search_tools = [{"type": "web_search"}]
python_tools = [{"type": "code_interpreter"}]

# Configurable reasoning effort: "high", "medium", or "low"
response = client.responses.create(
    model="openai/gpt-oss-120b",
    instructions="You are a helpful assistant.",
    reasoning_effort="high",
    input="In one sentence, explain the transformer architecture.",
)
print(response.output_text)

# Web search (requires EXA_API_KEY on the SGLang server)
response = client.responses.create(
    model="openai/gpt-oss-120b",
    instructions="You are a helpful assistant, you can search the web when needed.",
    input="Search the web for the latest news about Nvidia stock price",
    tools=search_tools,
)
print(response.output_text)

# Python tool (requires launching SGLang with --tool-server demo)
response = client.responses.create(
    model="openai/gpt-oss-120b",
    instructions="You are a helpful assistant, you could use python tool to execute code.",
    input="Use python tool to calculate the sum of 29138749187 and 29138749187",
    tools=python_tools,
)
print(response.output_text)
# Output: The sum is 58,277,498,374.
```

## 5.Benchmark

### 5.1 Speed Benchmark

* Hardware: NVIDIA B200 GPU (8x)
* Tensor Parallelism: 8
* Model: openai/gpt-oss-120b
* sglang version: 0.5.6

We use SGLang's built-in benchmarking tool to conduct performance evaluation on the [ShareGPT\_Vicuna\_unfiltered](https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered) dataset. This dataset contains real conversation data and can better reflect performance in actual use scenarios.

#### 5.1.1 Latency-Sensitive Benchmark

* Server Command:

```shell Command theme={null}
python -m sglang.launch_server \
  --model openai/gpt-oss-120b \
  --tp 8
```

* Test Command:

```shell Command theme={null}
python3 -m sglang.bench_serving \
  --backend sglang \
  --num-prompt 100 \
  --max-concurrency 1
```

* Test Results:

```text Output theme={null}
============ Serving Benchmark Result ============
Backend:                                 sglang
Traffic request rate:                    inf
Max request concurrency:                 1
Successful requests:                     100
Benchmark duration (s):                  52.35
Total input tokens:                      33178
Total input text tokens:                 33178
Total input vision tokens:               0
Total generated tokens:                  21251
Total generated tokens (retokenized):    20868
Request throughput (req/s):              1.91
Input token throughput (tok/s):          633.76
Output token throughput (tok/s):         405.93
Peak output token throughput (tok/s):    433.00
Peak concurrent requests:                8
Total token throughput (tok/s):          1039.69
Concurrency:                             1.00
----------------End-to-End Latency----------------
Mean E2E Latency (ms):                   523.30
Median E2E Latency (ms):                 389.91
---------------Time to First Token----------------
Mean TTFT (ms):                          33.71
Median TTFT (ms):                        31.79
P99 TTFT (ms):                           108.98
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms):                          2.31
Median TPOT (ms):                        2.31
P99 TPOT (ms):                           2.39
---------------Inter-Token Latency----------------
Mean ITL (ms):                           2.31
Median ITL (ms):                         2.31
P95 ITL (ms):                            2.35
P99 ITL (ms):                            2.38
Max ITL (ms):                            3.54
==================================================
```

#### 5.1.2 Throughput-Sensitive Benchmark

* Server Command:

```shell Command theme={null}
python -m sglang.launch_server \
  --model openai/gpt-oss-120b \
  --tp 8
```

* Test Command:

```shell Command theme={null}
python3 -m sglang.bench_serving \
  --backend sglang \
  --num-prompt 1000 \
  --max-concurrency 100
```

* Test Results:

```text Output theme={null}
============ Serving Benchmark Result ============
Backend:                                 sglang
Traffic request rate:                    inf
Max request concurrency:                 100
Successful requests:                     1000
Benchmark duration (s):                  24.76
Total input tokens:                      297156
Total input text tokens:                 297156
Total input vision tokens:               0
Total generated tokens:                  192432
Total generated tokens (retokenized):    187145
Request throughput (req/s):              40.39
Input token throughput (tok/s):          12003.57
Output token throughput (tok/s):         7773.26
Peak output token throughput (tok/s):    13780.00
Peak concurrent requests:                156
Total token throughput (tok/s):          19776.83
Concurrency:                             89.23
----------------End-to-End Latency----------------
Mean E2E Latency (ms):                   2208.97
Median E2E Latency (ms):                 1591.11
---------------Time to First Token----------------
Mean TTFT (ms):                          102.94
Median TTFT (ms):                        31.53
P99 TTFT (ms):                           674.32
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms):                          14.31
Median TPOT (ms):                        11.00
P99 TPOT (ms):                           91.28
---------------Inter-Token Latency----------------
Mean ITL (ms):                           11.00
Median ITL (ms):                         5.75
P95 ITL (ms):                            25.35
P99 ITL (ms):                            43.18
Max ITL (ms):                            621.42
==================================================
```

### 5.2 Accuracy Benchmark

#### 5.2.1 GSM8K Benchmark

* **Benchmark Command:**

```shell Command theme={null}
python3 -m sglang.test.few_shot_gsm8k --num-questions 200 --port 8000
```

* **Results**:

  * GPT-OSS-120b

    ```text Output theme={null}
    Accuracy: 0.880
    Invalid: 0.005
    Latency: 5.262 s
    Output throughput: 12143.675 token/s
    ```

  * GPT-OSS-20b

    ```text Output theme={null}
    Accuracy: 0.535
    Invalid: 0.165
    Latency: 4.157 s
    Output throughput: 19589.165 token/s
    ```