> ## Documentation Index
> Fetch the complete documentation index at: https://docs.sglang.io/llms.txt
> Use this file to discover all available pages before exploring further.

# Hunyuan 3 Preview

export const Hunyuan3PreviewDeployment = () => {
  const options = {
    hardware: {
      name: 'hardware',
      title: 'Hardware Platform',
      items: [{
        id: 'h200',
        label: 'H200',
        default: true
      }, {
        id: 'b200',
        label: 'B200',
        default: false
      }, {
        id: 'b300',
        label: 'B300',
        default: false
      }, {
        id: 'gb300',
        label: 'GB300',
        default: false
      }, {
        id: 'xeon',
        label: 'XEON',
        default: false
      }]
    },
    reasoning: {
      name: 'reasoning',
      title: 'Reasoning Parser',
      items: [{
        id: 'disabled',
        label: 'Disabled',
        default: false
      }, {
        id: 'enabled',
        label: 'Enabled',
        default: true
      }]
    },
    toolcall: {
      name: 'toolcall',
      title: 'Tool Call Parser',
      items: [{
        id: 'disabled',
        label: 'Disabled',
        default: false
      }, {
        id: 'enabled',
        label: 'Enabled',
        default: true
      }]
    },
    speculative: {
      name: 'speculative',
      title: 'Speculative Decoding (MTP)',
      getDynamicItems: values => {
        const isXeon = values && values.hardware === 'xeon';
        return [{
          id: 'disabled',
          label: 'Disabled',
          default: true
        }, {
          id: 'enabled',
          label: 'Enabled',
          subtitle: 'Low Latency',
          default: false,
          disabled: isXeon,
          disabledReason: isXeon ? 'Speculative decoding (MTP) is not supported on Intel Xeon CPUs' : ''
        }];
      }
    }
  };
  const modelConfigs = {
    h200: {
      tp: 8,
      mem: 0.9
    },
    b200: {
      tp: 8,
      mem: 0.9
    },
    b300: {
      tp: 4,
      mem: 0.9
    },
    gb300: {
      tp: 4,
      mem: 0.9
    },
    xeon: {
      tp: 6
    }
  };
  const resolveItems = (option, values) => {
    if (typeof option.getDynamicItems === 'function') return option.getDynamicItems(values);
    return option.items;
  };
  const getInitialState = () => {
    const initialState = {};
    for (const [key, option] of Object.entries(options)) {
      const items = resolveItems(option, initialState);
      const def = items.find(i => i.default && !i.disabled) || items.find(i => !i.disabled) || items[0];
      initialState[key] = def.id;
    }
    return initialState;
  };
  const [values, setValues] = useState(getInitialState);
  const [isDark, setIsDark] = useState(false);
  useEffect(() => {
    const checkDarkMode = () => {
      const html = document.documentElement;
      const isDarkMode = html.classList.contains('dark') || html.getAttribute('data-theme') === 'dark' || html.style.colorScheme === 'dark';
      setIsDark(isDarkMode);
    };
    checkDarkMode();
    const observer = new MutationObserver(checkDarkMode);
    observer.observe(document.documentElement, {
      attributes: true,
      attributeFilter: ['class', 'data-theme', 'style']
    });
    return () => observer.disconnect();
  }, []);
  const handleRadioChange = (optionName, value) => {
    setValues(prev => {
      const next = {
        ...prev,
        [optionName]: value
      };
      if (optionName === 'hardware') {
        for (const [key, option] of Object.entries(options)) {
          if (key === 'hardware') continue;
          const items = resolveItems(option, next);
          const current = items.find(i => i.id === next[key]);
          if (!current || current.disabled) {
            const fallback = items.find(i => i.default && !i.disabled) || items.find(i => !i.disabled);
            if (fallback) next[key] = fallback.id;
          }
        }
      }
      return next;
    });
  };
  const generateCommand = () => {
    const {hardware} = values;
    const isBlackwell = hardware === 'b200' || hardware === 'b300' || hardware === 'gb300';
    const isXeon = hardware === 'xeon';
    const hwConfig = modelConfigs[hardware];
    if (!hwConfig) return '# Configuration not available for the selected hardware.';
    const modelName = 'tencent/Hy3-preview';
    const tpValue = hwConfig.tp;
    const memFraction = hwConfig.mem;
    const enableSpec = values.speculative === 'enabled' && !isXeon;
    let cmd = '';
    if (enableSpec) cmd += 'SGLANG_ENABLE_SPEC_V2=1 ';
    cmd += 'sglang serve \\\n';
    cmd += `  --model-path ${modelName}`;
    cmd += ` \\\n  --tp ${tpValue}`;
    if (values.reasoning === 'enabled') cmd += ' \\\n  --reasoning-parser hunyuan';
    if (values.toolcall === 'enabled') cmd += ' \\\n  --tool-call-parser hunyuan';
    if (enableSpec) {
      cmd += ' \\\n  --speculative-algorithm EAGLE';
      cmd += ' \\\n  --speculative-num-steps 3';
      cmd += ' \\\n  --speculative-eagle-topk 1';
      cmd += ' \\\n  --speculative-num-draft-tokens 4';
    }
    cmd += ' \\\n  --trust-remote-code';
    if (memFraction !== undefined) cmd += ` \\\n  --mem-fraction-static ${memFraction}`;
    if (isBlackwell && !isXeon) cmd += ' \\\n  --attention-backend trtllm_mha';
    if (isXeon) cmd += ' \\\n  --device cpu \\\n  --disable-overlap-schedule';
    return cmd;
  };
  const containerStyle = {
    maxWidth: '900px',
    margin: '0 auto',
    display: 'flex',
    flexDirection: 'column',
    gap: '4px'
  };
  const cardStyle = {
    padding: '8px 12px',
    border: `1px solid ${isDark ? '#374151' : '#e5e7eb'}`,
    borderLeft: `3px solid ${isDark ? '#E85D4D' : '#D45D44'}`,
    borderRadius: '4px',
    display: 'flex',
    alignItems: 'center',
    gap: '12px',
    background: isDark ? '#1f2937' : '#fff'
  };
  const titleStyle = {
    fontSize: '13px',
    fontWeight: '600',
    minWidth: '140px',
    flexShrink: 0,
    color: isDark ? '#e5e7eb' : 'inherit'
  };
  const itemsStyle = {
    display: 'flex',
    rowGap: '2px',
    columnGap: '6px',
    flexWrap: 'wrap',
    alignItems: 'center',
    flex: 1
  };
  const labelBaseStyle = {
    padding: '4px 10px',
    border: `1px solid ${isDark ? '#9ca3af' : '#d1d5db'}`,
    borderRadius: '3px',
    cursor: 'pointer',
    display: 'inline-flex',
    flexDirection: 'column',
    alignItems: 'center',
    justifyContent: 'center',
    fontWeight: '500',
    fontSize: '13px',
    transition: 'all 0.2s',
    userSelect: 'none',
    minWidth: '45px',
    textAlign: 'center',
    flex: 1,
    background: isDark ? '#374151' : '#fff',
    color: isDark ? '#e5e7eb' : 'inherit'
  };
  const checkedStyle = {
    background: '#D45D44',
    color: 'white',
    borderColor: '#D45D44'
  };
  const disabledStyle = {
    cursor: 'not-allowed',
    opacity: 0.4
  };
  const subtitleStyle = {
    display: 'block',
    fontSize: '9px',
    marginTop: '1px',
    lineHeight: '1.1',
    opacity: 0.7
  };
  const commandDisplayStyle = {
    flex: 1,
    padding: '12px 16px',
    background: isDark ? '#111827' : '#f5f5f5',
    borderRadius: '6px',
    fontFamily: "'Menlo', 'Monaco', 'Courier New', monospace",
    fontSize: '12px',
    lineHeight: '1.5',
    color: isDark ? '#e5e7eb' : '#374151',
    whiteSpace: 'pre-wrap',
    overflowX: 'auto',
    margin: 0,
    border: `1px solid ${isDark ? '#374151' : '#e5e7eb'}`
  };
  return <div style={containerStyle} className="not-prose">
      {Object.entries(options).map(([key, option]) => {
    if (typeof option.condition === 'function' && !option.condition(values)) return null;
    const items = resolveItems(option, values);
    return <div key={key} style={cardStyle}>
            <div style={titleStyle}>{option.title}</div>
            <div style={itemsStyle}>
              {items.map(item => {
      const isChecked = values[option.name] === item.id;
      const isDisabled = !!item.disabled;
      return <label key={item.id} style={{
        ...labelBaseStyle,
        ...isChecked ? checkedStyle : {},
        ...isDisabled ? disabledStyle : {}
      }} title={item.disabledReason || ''}>
                    <input type="radio" name={option.name} value={item.id} checked={isChecked} disabled={isDisabled} onChange={() => !isDisabled && handleRadioChange(option.name, item.id)} style={{
        display: 'none'
      }} />
                    {item.label}
                    {item.subtitle && <small style={{
        ...subtitleStyle,
        color: isChecked ? 'rgba(255,255,255,0.85)' : 'inherit'
      }}>{item.subtitle}</small>}
                  </label>;
    })}
            </div>
          </div>;
  })}
      <div style={cardStyle}>
        <div style={titleStyle}>Run this Command:</div>
        <pre style={commandDisplayStyle}>{generateCommand()}</pre>
      </div>
    </div>;
};

## 1. Model Introduction

Hunyuan 3 Preview (Hy3-preview) is Tencent's preview of its third-generation flagship MoE language model, featuring hybrid thinking, native tool calling, long-context reasoning, and Multi-Token Prediction (MTP) for low-latency serving.

**Key Features:**

* **MoE Architecture**: 192 routed experts + 1 shared expert, 8 experts activated per token. \~276B total parameters with \~20B active, delivering dense-model quality at MoE inference cost.
* **Hybrid Thinking**: Reasoning modes (`high`, `medium`, `low`, `none`) controllable via OpenAI-standard `reasoning_effort`, allowing the same weights to trade off latency and depth of reasoning.
* **Native Tool Calling**: Trained on structured `<tool_call>` / `<arg_key>` / `<arg_value>` grammar. Pairs with SGLang's `hunyuan` tool-call parser for streaming OpenAI-compatible function-calling output.
* **Long Context**: 256K token context window (262,144 positions) for repository-scale code and document reasoning.
* **Multi-Token Prediction (MTP)**: Ships with a built-in MTP draft module enabling speculative decoding out of the box.

**Available Models:**

* [tencent/Hy3-preview](https://huggingface.co/tencent/Hy3-preview) — BF16 instruct
* [tencent/Hy3-preview-Base](https://huggingface.co/tencent/Hy3-preview-Base) — BF16 base

**Recommended Generation Parameters:**

<table style={{width: "100%", borderCollapse: "collapse", tableLayout: "fixed"}}>
  <thead>
    <tr style={{borderBottom: "2px solid #d55816"}}>
      <th style={{textAlign: "left", padding: "10px 12px", fontWeight: 700, whiteSpace: "nowrap", backgroundColor: "rgba(255,255,255,0.02)"}}>Parameter</th>
      <th style={{textAlign: "left", padding: "10px 12px", fontWeight: 700, whiteSpace: "nowrap", backgroundColor: "rgba(255,255,255,0.05)"}}>Value</th>
    </tr>
  </thead>

  <tbody>
    <tr>
      <td style={{padding: "9px 12px", fontWeight: 500, backgroundColor: "rgba(255,255,255,0.02)"}}>`temperature`</td>
      <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.05)"}}>0.7</td>
    </tr>

    <tr>
      <td style={{padding: "9px 12px", fontWeight: 500, backgroundColor: "rgba(255,255,255,0.02)"}}>`top_p`</td>
      <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.05)"}}>0.9</td>
    </tr>

    <tr>
      <td style={{padding: "9px 12px", fontWeight: 500, backgroundColor: "rgba(255,255,255,0.02)"}}>`reasoning_effort`</td>
      <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.05)"}}>`high` / `medium` / `low` (thinking) or `none` (instant)</td>
    </tr>
  </tbody>
</table>

**License:** TODO — verify on HuggingFace model card.

## 2. SGLang Installation

SGLang offers multiple installation methods. You can choose the most suitable installation method based on your hardware platform and requirements.

Please refer to the [official SGLang installation guide](../../../docs/get-started/install) for installation instructions.

**Docker Images by Hardware Platform:**

<table style={{width: "100%", borderCollapse: "collapse", tableLayout: "fixed"}}>
  <thead>
    <tr style={{borderBottom: "2px solid #d55816"}}>
      <th style={{textAlign: "left", padding: "10px 12px", fontWeight: 700, whiteSpace: "nowrap", backgroundColor: "rgba(255,255,255,0.02)"}}>Hardware Platform</th>
      <th style={{textAlign: "left", padding: "10px 12px", fontWeight: 700, whiteSpace: "nowrap", backgroundColor: "rgba(255,255,255,0.05)"}}>Docker Image</th>
    </tr>
  </thead>

  <tbody>
    <tr>
      <td style={{padding: "9px 12px", fontWeight: 500, backgroundColor: "rgba(255,255,255,0.02)"}}>NVIDIA H200 / B200</td>
      <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.05)"}}>`lmsysorg/sglang:hy3-preview`</td>
    </tr>

    <tr>
      <td style={{padding: "9px 12px", fontWeight: 500, backgroundColor: "rgba(255,255,255,0.02)"}}>NVIDIA B300 / GB300</td>
      <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.05)"}}>`lmsysorg/sglang:hy3-preview-cu130`</td>
    </tr>
  </tbody>
</table>

The `hy3-preview` tag bundles the HYV3 model code, the `hunyuan` tool-call / reasoning parsers, and the MTP draft-module runtime.

For SGLang CPU installation, please refer to the [CPU version installation guide](../../../docs/hardware-platforms/cpu_server#installation).

## 3. Model Deployment

This section provides deployment configurations optimized for different hardware platforms and use cases.

### 3.1 Basic Configuration

**Interactive Command Generator**: Use the configuration selector below to automatically generate the appropriate deployment command for your hardware platform and feature selection (reasoning parser, tool-call parser, and speculative decoding).

<Hunyuan3PreviewDeployment />

### 3.2 Configuration Tips

**Key Parameters:**

<table style={{width: "100%", borderCollapse: "collapse", tableLayout: "fixed"}}>
  <thead>
    <tr style={{borderBottom: "2px solid #d55816"}}>
      <th style={{textAlign: "left", padding: "10px 12px", fontWeight: 700, whiteSpace: "nowrap", backgroundColor: "rgba(255,255,255,0.02)"}}>Parameter</th>
      <th style={{textAlign: "left", padding: "10px 12px", fontWeight: 700, whiteSpace: "nowrap", backgroundColor: "rgba(255,255,255,0.05)"}}>Description</th>
      <th style={{textAlign: "left", padding: "10px 12px", fontWeight: 700, whiteSpace: "nowrap", backgroundColor: "rgba(255,255,255,0.02)"}}>Recommended Value</th>
    </tr>
  </thead>

  <tbody>
    <tr>
      <td style={{padding: "9px 12px", fontWeight: 500, backgroundColor: "rgba(255,255,255,0.02)"}}>`--tool-call-parser`</td>
      <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.05)"}}>Tool call parser for function-calling support</td>
      <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.02)"}}>`hunyuan`</td>
    </tr>

    <tr>
      <td style={{padding: "9px 12px", fontWeight: 500, backgroundColor: "rgba(255,255,255,0.02)"}}>`--reasoning-parser`</td>
      <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.05)"}}>Reasoning parser for hybrid thinking modes</td>
      <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.02)"}}>`hunyuan`</td>
    </tr>

    <tr>
      <td style={{padding: "9px 12px", fontWeight: 500, backgroundColor: "rgba(255,255,255,0.02)"}}>`--trust-remote-code`</td>
      <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.05)"}}>Required for Hunyuan model loading</td>
      <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.02)"}}>Always enabled</td>
    </tr>

    <tr>
      <td style={{padding: "9px 12px", fontWeight: 500, backgroundColor: "rgba(255,255,255,0.02)"}}>`--mem-fraction-static`</td>
      <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.05)"}}>Static memory fraction (KV + activations)</td>
      <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.02)"}}>`0.9`</td>
    </tr>

    <tr>
      <td style={{padding: "9px 12px", fontWeight: 500, backgroundColor: "rgba(255,255,255,0.02)"}}>`--tp`</td>
      <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.05)"}}>Tensor parallelism size</td>
      <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.02)"}}>`4` / `6` / `8` depending on hardware</td>
    </tr>

    <tr>
      <td style={{padding: "9px 12px", fontWeight: 500, backgroundColor: "rgba(255,255,255,0.02)"}}>`--attention-backend`</td>
      <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.05)"}}>Attention backend (Blackwell only)</td>
      <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.02)"}}>`trtllm_mha`</td>
    </tr>

    <tr>
      <td style={{padding: "9px 12px", fontWeight: 500, backgroundColor: "rgba(255,255,255,0.02)"}}>`--speculative-algorithm`</td>
      <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.05)"}}>Speculative decoding via the bundled MTP draft</td>
      <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.02)"}}>`EAGLE` + `--speculative-num-steps 3 --speculative-eagle-topk 1 --speculative-num-draft-tokens 4`</td>
    </tr>
  </tbody>
</table>

**Hardware Requirements: NVIDIA BF16 (`Hy3-preview`, \~552GB weights)**

* **H200 (141GB) / B200 (180GB)**: TP=8 (minimum for BF16 to fit single-node).
* **B300 (275GB) / GB300**: TP=4.
* **A100 / H100 (80GB)**: not supported single-node — BF16 requires multi-node TP=16+ on 80GB-class GPUs.

**Blackwell (B200 / B300 / GB300):** The auto-selected attention backend can mis-route for HYV3 on Blackwell. Always pass `--attention-backend trtllm_mha` explicitly on Blackwell hardware (the config generator above enforces this).

**Multi-Token Prediction (MTP):** The `Hy3-preview` release bundles an MTP draft module. SGLang runs it via its EAGLE speculative-decoding path — the draft module auto-loads from the same `--model-path`. Enable with the standard MTP flags:

```bash Command theme={null}
sglang serve \
  --model-path tencent/Hy3-preview \
  --tp 8 \
  --speculative-algorithm EAGLE \
  --speculative-num-steps 3 \
  --speculative-eagle-topk 1 \
  --speculative-num-draft-tokens 4 \
  --reasoning-parser hunyuan \
  --tool-call-parser hunyuan \
  --trust-remote-code \
  --mem-fraction-static 0.85
```

Toggle the "Speculative Decoding (MTP)" option in the generator above to add these flags automatically. Tune `num-steps` / `num-draft-tokens` based on acceptance rate in your workload.

To configure the CPU service, please refer to the `Notes` in the serving-engine launch section of [the SGLang CPU server document](../../../docs/hardware-platforms/cpu_server#launch-of-the-serving-engine) for guidance on the arguments, especially the TP (tensor parallel) and NUMA binding settings.

## 4. Model Invocation

### 4.1 Basic Usage

For basic API usage and request examples, please refer to:

* [SGLang Basic Usage Guide](../../../docs/basic_usage/send_request)

**Deployment Command (H200 × 8, BF16 default):**

```bash Command theme={null}
sglang serve \
  --model-path tencent/Hy3-preview \
  --tp 8 \
  --reasoning-parser hunyuan \
  --tool-call-parser hunyuan \
  --trust-remote-code \
  --mem-fraction-static 0.9
```

**Testing Deployment:**

After startup, you can test the SGLang OpenAI-compatible API with the following command:

```bash Command theme={null}
curl http://localhost:30000/v1/chat/completions \
    -H "Content-Type: application/json" \
    -d '{
        "model": "tencent/Hy3-preview",
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "Who won the world series in 2020?"}
        ]
    }'
```

**Simple Completion Example:**

```python Example theme={null}
from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:30000/v1",
    api_key="EMPTY"
)

response = client.chat.completions.create(
    model="tencent/Hy3-preview",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Who won the world series in 2020?"}
    ],
    max_tokens=1024
)

print("Reasoning:", response.choices[0].message.reasoning_content)
print("Content:  ", response.choices[0].message.content)
```

**Output Example:**

```text Output theme={null}
Reasoning: None
Content:   The Los Angeles Dodgers won the 2020 World Series. They defeated the Tampa Bay Rays in six games (4-2). This was the Dodgers' first World Series championship since 1988. The series was notable for being played in a neutral-site bubble at Globe Life Field in Arlington, Texas, due to the COVID-19 pandemic.
```

When `reasoning_effort` is not set, the server defaults to instant mode (no thinking, `reasoning_content=None`). To opt into thinking, pass `reasoning_effort="high" / "medium" / "low"` on the request — see the Hybrid Thinking section below.

### 4.2 Advanced Usage

#### 4.2.1 Reasoning Parser (Hybrid Thinking)

Hy3-preview is a hybrid-thinking model. Control the thinking budget via the OpenAI-standard `reasoning_effort`:

* `high` / `medium` / `low` — decreasing amounts of chain-of-thought in `reasoning_content` (`high` thinks the most, `low` the least)
* `none` — skip thinking entirely (instant responses, content-only)

Enable the reasoning parser during deployment so that the thinking section (`<think>...</think>`) is separated into `reasoning_content`:

```bash Command theme={null}
sglang serve \
  --model-path tencent/Hy3-preview \
  --tp 8 \
  --reasoning-parser hunyuan \
  --trust-remote-code \
  --mem-fraction-static 0.9
```

**Thinking Mode — High Effort:**

```python Example theme={null}
from openai import OpenAI

client = OpenAI(base_url="http://localhost:30000/v1", api_key="EMPTY")

response = client.chat.completions.create(
    model="tencent/Hy3-preview",
    messages=[{"role": "user", "content": "Solve step by step: What is 15% of 240?"}],
    reasoning_effort="high",
    max_tokens=2048,
)

msg = response.choices[0].message
print("=============== Thinking =================")
print(msg.reasoning_content)
print("=============== Content =================")
print(msg.content)
```

**Output Example:**

```text Output theme={null}
=============== Thinking =================
We need to solve: "What is 15% of 240?" Step by step. So we need to compute 15% of 240. The process: 15% means 15 per hundred, i.e., 15/100 = 0.15. Multiply 0.15 by 240. Or we can do: 10% of 240 = 24, 5% is half of 10% = 12, so sum = 36. Or do multiplication: 15/100 * 240 = (15*240)/100 = (3600)/100 = 36. So answer is 36.

We need to produce step-by-step explanation. The instruction: "Solve step by step: What is 15% of 240?" So we should provide a clear solution with steps. The final answer: 36. Also maybe include units? No units.

We'll output the solution in a clear manner.
=============== Content =================
To find 15% of 240, follow these steps:

1. **Understand that percent means "per hundred."**
   So, 15% = 15/100 or 0.15.

2. **Multiply the number (240) by the percentage in decimal form.**
   0.15 × 240.

   Alternatively, you can use fractions:
   (15/100) × 240.

3. **Perform the multiplication.**
   0.15 × 240 = 36.
   Or:
   (15 × 240) / 100 = 3600 / 100 = 36.

4. **Check using an alternative method:**
   - 10% of 240 = 24.
   - 5% of 240 = half of 10% = 12.
   - 15% = 10% + 5% = 24 + 12 = 36.

Thus, **15% of 240 is 36**.
```

**Instant Mode — No Thinking:**

```python Example theme={null}
response = client.chat.completions.create(
    model="tencent/Hy3-preview",
    messages=[{"role": "user", "content": "Give me a one-line summary of relativity."}],
    reasoning_effort="none",
    max_tokens=256,
)

print("Content:", response.choices[0].message.content)
```

**Output Example:**

```text Output theme={null}
Content: Relativity is Einstein's theory that space, time, mass, and gravity are interconnected and relative, not fixed, fundamentally changing our understanding of the universe.
```

#### 4.2.2 Tool Calling

Hy3-preview supports streaming OpenAI-compatible tool calls. Enable both parsers together — the reasoning parser strips thinking tokens before the tool-call parser runs:

```bash Command theme={null}
sglang serve \
  --model-path tencent/Hy3-preview \
  --tp 8 \
  --reasoning-parser hunyuan \
  --tool-call-parser hunyuan \
  --trust-remote-code \
  --mem-fraction-static 0.9
```

**Non-Streaming Example:**

```python Example theme={null}
from openai import OpenAI

client = OpenAI(base_url="http://localhost:30000/v1", api_key="EMPTY")

tools = [
    {
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Get the current weather for a city.",
            "parameters": {
                "type": "object",
                "properties": {
                    "city": {"type": "string"},
                    "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
                },
                "required": ["city"],
            },
        },
    }
]

response = client.chat.completions.create(
    model="tencent/Hy3-preview",
    messages=[{"role": "user", "content": "What's the weather in Beijing? Use fahrenheit."}],
    tools=tools,
)

msg = response.choices[0].message
print("Reasoning:", msg.reasoning_content)
print("Content:  ", msg.content)
for tc in msg.tool_calls or []:
    print(f"Tool Call: {tc.function.name}")
    print(f"  Arguments: {tc.function.arguments}")
```

**Output Example:**

```text Output theme={null}
Reasoning: None
Content:   I'll get the current weather for Beijing in Fahrenheit for you.
Tool Call: get_weather
  Arguments: {"city": "Beijing", "unit": "fahrenheit"}
```

**Streaming Example (incremental argument deltas):**

Hy3-preview's `hunyuan` tool-call parser emits tool names first, then argument JSON in incremental fragments — matching the OpenAI streaming contract. The snippet below reuses the `tools` list defined in the non-streaming example above:

```python Example theme={null}
from openai import OpenAI

client = OpenAI(base_url="http://localhost:30000/v1", api_key="EMPTY")

stream = client.chat.completions.create(
    model="tencent/Hy3-preview",
    messages=[{"role": "user", "content": "What's the weather in Beijing? Use fahrenheit."}],
    tools=tools,
    stream=True,
)

tool_buffer = {}
for chunk in stream:
    delta = chunk.choices[0].delta
    if delta.content:
        print(delta.content, end="", flush=True)
    for tc in delta.tool_calls or []:
        buf = tool_buffer.setdefault(tc.index, {"name": "", "args": ""})
        if tc.function and tc.function.name:
            buf["name"] += tc.function.name
        if tc.function and tc.function.arguments:
            buf["args"] += tc.function.arguments

for idx, buf in tool_buffer.items():
    print(f"\nTool[{idx}] {buf['name']}({buf['args']})")
```

**Output Example:**

```text Output theme={null}
I'll check the current weather in Beijing for you using Fahrenheit.
Tool[0] get_weather({"city": "Beijing", "unit": "fahrenheit"})
```

## 5. Benchmark

### 5.1 Accuracy Benchmark

**Test Environment:**

* Hardware: 8× NVIDIA H200 (141GB)
* Docker Image: `lmsysorg/sglang:hy3-preview`
* Model: `tencent/Hy3-preview` (BF16)
* Tensor Parallelism: 8
* SGLang version: latest `main`

#### 5.1.1 GSM8K

* Benchmark Method: 5-shot CoT on 200 questions, evaluated via SGLang native backend
* Benchmark Command:

```bash Command theme={null}
python3 benchmark/gsm8k/bench_sglang.py --num-questions 200 --parallel 64
```

* Test Results:

```text Output theme={null}
TODO — replace with real GSM8K accuracy after benchmark run on Hy3-preview (BF16).
```

#### 5.1.2 MMLU

* Benchmark Method: 5-shot, all 57 subjects
* Benchmark Command:

```bash Command theme={null}
python3 benchmark/mmlu/bench_sglang.py --nsub 60 --parallel 64
```

* Test Results:

```text Output theme={null}
TODO — replace with real MMLU accuracy after benchmark run on Hy3-preview (BF16).
```

#### 5.1.3 Tool-Call Accuracy (MiniMax-Provider-Verifier)

* Benchmark Tool: [MiniMax-Provider-Verifier](https://github.com/MiniMax-AI/MiniMax-Provider-Verifier)
* Metric: function-call schema validity, argument match, and end-to-end response correctness
* Test Results:

```text Output theme={null}
TODO — replace with real tool-call accuracy after benchmark run on Hy3-preview (BF16).
```

### 5.2 Speed Benchmark

#### 5.2.1 Low Concurrency

* Benchmark Command:

```bash Command theme={null}
python3 -m sglang.bench_serving \
  --backend sglang \
  --model tencent/Hy3-preview \
  --dataset-name random \
  --random-input-len 1000 \
  --random-output-len 1000 \
  --num-prompts 10 \
  --max-concurrency 1
```

* Test Results:

```text Output theme={null}
TODO — replace with real low-concurrency output on Hy3-preview (BF16).
```

#### 5.2.2 High Concurrency

* Benchmark Command:

```bash Command theme={null}
python3 -m sglang.bench_serving \
  --backend sglang \
  --model tencent/Hy3-preview \
  --dataset-name random \
  --random-input-len 1000 \
  --random-output-len 1000 \
  --num-prompts 500 \
  --max-concurrency 100
```

* Test Results:

```text Output theme={null}
TODO — replace with real high-concurrency output on Hy3-preview (BF16).
```