> ## Documentation Index
> Fetch the complete documentation index at: https://docs.sglang.io/llms.txt
> Use this file to discover all available pages before exploring further.

# Llama-3.3-70B

export const Llama33Deployment = () => {
  const options = {
    hardware: {
      name: 'hardware',
      title: 'Hardware Platform',
      items: [{
        id: 'mi300x',
        label: 'MI300X',
        default: true
      }, {
        id: 'mi325x',
        label: 'MI325X',
        default: false
      }, {
        id: 'mi355x',
        label: 'MI355X',
        default: false
      }]
    },
    quantization: {
      name: 'quantization',
      title: 'Quantization',
      items: [{
        id: 'bf16',
        label: 'BF16',
        default: true
      }, {
        id: 'fp8',
        label: 'FP8',
        default: false
      }]
    },
    toolcall: {
      name: 'toolcall',
      title: 'Tool Calling',
      items: [{
        id: 'enabled',
        label: 'Enabled',
        default: true
      }, {
        id: 'disabled',
        label: 'Disabled',
        default: false
      }]
    }
  };
  const getInitialState = () => {
    const initialState = {};
    Object.entries(options).forEach(([key, option]) => {
      const defaultItem = option.items.find(item => item.default);
      initialState[key] = defaultItem ? defaultItem.id : option.items[0].id;
    });
    return initialState;
  };
  const [values, setValues] = useState(getInitialState);
  const [isDark, setIsDark] = useState(false);
  useEffect(() => {
    const checkDarkMode = () => {
      const html = document.documentElement;
      const isDarkMode = html.classList.contains('dark') || html.getAttribute('data-theme') === 'dark' || html.style.colorScheme === 'dark';
      setIsDark(isDarkMode);
    };
    checkDarkMode();
    const observer = new MutationObserver(checkDarkMode);
    observer.observe(document.documentElement, {
      attributes: true,
      attributeFilter: ['class', 'data-theme', 'style']
    });
    return () => observer.disconnect();
  }, []);
  const handleRadioChange = (optionName, value) => {
    setValues(prev => ({
      ...prev,
      [optionName]: value
    }));
  };
  const generateCommand = () => {
    const {hardware, quantization, toolcall} = values;
    const modelPath = quantization === 'fp8' ? 'amd/Llama-3.3-70B-Instruct-FP8-KV' : 'meta-llama/Llama-3.3-70B-Instruct';
    let cmd = 'python -m sglang.launch_server \\\n';
    cmd += `  --model-path ${modelPath} \\\n`;
    cmd += `  --tp 1`;
    if (toolcall === 'enabled') {
      cmd += ' \\\n  --tool-call-parser llama3';
    }
    cmd += ' \\\n  --host 0.0.0.0 \\\n';
    cmd += '  --port 30000';
    return cmd;
  };
  const containerStyle = {
    maxWidth: '900px',
    margin: '0 auto',
    display: 'flex',
    flexDirection: 'column',
    gap: '4px'
  };
  const cardStyle = {
    padding: '8px 12px',
    border: `1px solid ${isDark ? '#374151' : '#e5e7eb'}`,
    borderLeft: `3px solid ${isDark ? '#E85D4D' : '#D45D44'}`,
    borderRadius: '4px',
    display: 'flex',
    alignItems: 'center',
    gap: '12px',
    background: isDark ? '#1f2937' : '#fff'
  };
  const titleStyle = {
    fontSize: '13px',
    fontWeight: '600',
    minWidth: '140px',
    flexShrink: 0,
    color: isDark ? '#e5e7eb' : 'inherit'
  };
  const itemsStyle = {
    display: 'flex',
    rowGap: '2px',
    columnGap: '6px',
    flexWrap: 'wrap',
    alignItems: 'center',
    flex: 1
  };
  const labelBaseStyle = {
    padding: '4px 10px',
    border: `1px solid ${isDark ? '#9ca3af' : '#d1d5db'}`,
    borderRadius: '3px',
    cursor: 'pointer',
    display: 'inline-flex',
    flexDirection: 'column',
    alignItems: 'center',
    justifyContent: 'center',
    fontWeight: '500',
    fontSize: '13px',
    transition: 'all 0.2s',
    userSelect: 'none',
    minWidth: '45px',
    textAlign: 'center',
    flex: 1,
    background: isDark ? '#374151' : '#fff',
    color: isDark ? '#e5e7eb' : 'inherit'
  };
  const checkedStyle = {
    background: '#D45D44',
    color: 'white',
    borderColor: '#D45D44'
  };
  const disabledStyle = {
    cursor: 'not-allowed',
    opacity: 0.5
  };
  const subtitleStyle = {
    display: 'block',
    fontSize: '9px',
    marginTop: '1px',
    lineHeight: '1.1',
    opacity: 0.7
  };
  const commandDisplayStyle = {
    flex: 1,
    padding: '12px 16px',
    background: isDark ? '#111827' : '#f5f5f5',
    borderRadius: '6px',
    fontFamily: "'Menlo', 'Monaco', 'Courier New', monospace",
    fontSize: '12px',
    lineHeight: '1.5',
    color: isDark ? '#e5e7eb' : '#374151',
    whiteSpace: 'pre-wrap',
    overflowX: 'auto',
    margin: 0,
    border: `1px solid ${isDark ? '#374151' : '#e5e7eb'}`
  };
  return <div style={containerStyle} className="not-prose">
      {Object.entries(options).map(([key, option]) => <div key={key} style={cardStyle}>
          <div style={titleStyle}>{option.title}</div>
          <div style={itemsStyle}>
            {option.type === 'checkbox' ? option.items.map(item => {
    const isChecked = (values[option.name] || []).includes(item.id);
    const isDisabled = item.required;
    return <label key={item.id} style={{
      ...labelBaseStyle,
      ...isChecked ? checkedStyle : {},
      ...isDisabled ? disabledStyle : {}
    }}>
                    <input type="checkbox" checked={isChecked} disabled={isDisabled} onChange={e => handleCheckboxChange(option.name, item.id, e.target.checked)} style={{
      display: 'none'
    }} />
                    {item.label}
                    {item.subtitle && <small style={{
      ...subtitleStyle,
      color: isChecked ? 'rgba(255,255,255,0.85)' : 'inherit'
    }}>{item.subtitle}</small>}
                  </label>;
  }) : option.items.map(item => {
    const isChecked = values[option.name] === item.id;
    return <label key={item.id} style={{
      ...labelBaseStyle,
      ...isChecked ? checkedStyle : {}
    }}>
                    <input type="radio" name={option.name} value={item.id} checked={isChecked} onChange={() => handleRadioChange(option.name, item.id)} style={{
      display: 'none'
    }} />
                    {item.label}
                    {item.subtitle && <small style={{
      ...subtitleStyle,
      color: isChecked ? 'rgba(255,255,255,0.85)' : 'inherit'
    }}>{item.subtitle}</small>}
                  </label>;
  })}
          </div>
        </div>)}
      <div style={cardStyle}>
        <div style={titleStyle}>Run this Command:</div>
        <pre style={commandDisplayStyle}>{generateCommand()}</pre>
      </div>
    </div>;
};

## 1. Model Introduction

[Llama-3.3-70B-Instruct](https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct) is Meta's latest 70 billion parameter instruction-tuned language model, featuring improved performance and efficiency over Llama 3.1. With a 128K token context window and enhanced capabilities across reasoning, coding, and multilingual tasks, Llama 3.3 delivers state-of-the-art results while maintaining accessibility for production deployment.

**Key Features:**

* **Enhanced Performance**: Improved instruction following, reasoning, and task completion over Llama 3.1
* **Tool Calling**: Native support for function calling and tool use scenarios
* **Multilingual Support**: Optimized for 8 languages (English, German, French, Italian, Portuguese, Hindi, Spanish, and Thai)
* **Extended Context**: 128K token context window for processing long documents and complex tasks
* **Efficient Deployment**: The 70B-parameter model can be deployed on a single AMD MI300X GPU

**License:**
Llama 3.3 is licensed under the Llama 3.3 Community License. See [LICENSE](https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct/blob/main/LICENSE) for details.

For more details, please refer to the [official Llama models repository](https://github.com/meta-llama/llama-models).

## 2. SGLang Installation

Please refer to the [official SGLang installation guide](../../../docs/get-started/install) for installation instructions.

## 3. Model Deployment

This section provides deployment configurations optimized for AMD GPUs (MI300X, MI325X, MI355X).

### 3.1 Interactive Configuration

**Interactive Command Generator**: Use the configuration selector below to automatically generate the appropriate deployment command for your AMD GPU setup.

<Llama33Deployment />

### 3.2 Configuration Tips

**AMD GPU Deployment:**

* All AMD GPUs (MI300X, MI325X, MI355X) support TP=1 for both BF16 and FP8 variants
* **FP8 Model Variant**: Use AMD's optimized `amd/Llama-3.3-70B-Instruct-FP8-KV`
* **Tool Calling**: Enable with `--tool-call-parser llama3` for function calling support
* **Higher Throughput**: Optionally use TP=2 or TP=4 (`--tp 2` or `--tp 4`) for increased throughput

## 4. Model Invocation

### 4.1 Basic Usage

For basic API usage and request examples, please refer to:

* [SGLang Basic Usage Guide](../../../docs/basic_usage/send_request)

### 4.2 Advanced Usage

#### 4.2.1 Tool Calling

Llama 3.3 70B Instruct supports native tool calling. Enable the tool parser during deployment:

```shell Command theme={null}
python -m sglang.launch_server \
  --model-path meta-llama/Llama-3.3-70B-Instruct \
  --tool-call-parser llama3 \
  --tp 1 \
  --host 0.0.0.0 \
  --port 30000
```

**Python Example:**

```python Example theme={null}
from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:30000/v1",
    api_key="EMPTY"
)

# Define available tools
tools = [
    {
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Get the current weather for a location",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {
                        "type": "string",
                        "description": "The city name"
                    },
                    "unit": {
                        "type": "string",
                        "enum": ["celsius", "fahrenheit"],
                        "description": "Temperature unit"
                    }
                },
                "required": ["location"]
            }
        }
    }
]

# Make request
response = client.chat.completions.create(
    model="meta-llama/Llama-3.3-70B-Instruct",
    messages=[
        {"role": "user", "content": "What's the weather in Tokyo?"}
    ],
    tools=tools,
    temperature=0.7
)

# Check for tool calls
message = response.choices[0].message
if message.tool_calls:
    tool_call = message.tool_calls[0]
    print(f"Function: {tool_call.function.name}")
    print(f"Arguments: {tool_call.function.arguments}")
```

**Handling Tool Call Results:**

```python Example theme={null}
# After executing the function, send the result back
def get_weather(location, unit="celsius"):
    # Your weather API call here
    return f"The weather in {location} is 22°{unit[0].upper()} and sunny."

# Build conversation with tool result
messages = [
    {"role": "user", "content": "What's the weather in Tokyo?"},
    {
        "role": "assistant",
        "content": None,
        "tool_calls": [{
            "id": "call_123",
            "type": "function",
            "function": {
                "name": "get_weather",
                "arguments": '{"location": "Tokyo", "unit": "celsius"}'
            }
        }]
    },
    {
        "role": "tool",
        "tool_call_id": "call_123",
        "content": get_weather("Tokyo", "celsius")
    }
]

final_response = client.chat.completions.create(
    model="meta-llama/Llama-3.3-70B-Instruct",
    messages=messages,
    temperature=0.7
)

print(final_response.choices[0].message.content)
# Output: "The current weather in Tokyo is 22°C and sunny. A perfect day!"
```

#### 4.2.2 Long Context Processing

Leverage the 128K context window for processing long documents:

```python Example theme={null}
from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:30000/v1",
    api_key="EMPTY"
)

# Example with long document
long_document = "..." * 10000  # Your long document here

response = client.chat.completions.create(
    model="meta-llama/Llama-3.3-70B-Instruct",
    messages=[
        {"role": "user", "content": f"Summarize this document:\n\n{long_document}"}
    ],
    temperature=0.7,
    max_tokens=1000
)

print(response.choices[0].message.content)
```

## 5. Benchmarking

Use the SGLang benchmarking suite to test model performance under different workload patterns.

### 5.1 Basic Benchmark Command

```bash Command theme={null}
python -m sglang.bench_serving \
  --backend sglang \
  --dataset-name random \
  --num-prompts 1000 \
  --random-input 1024 \
  --random-output 1024 \
  --max-concurrency 16
```

### 5.2 Adjusting Benchmark Parameters

**Input/Output Length**: Adjust `--random-input` and `--random-output` to test different workload patterns:

* Short conversations: `--random-input 1024 --random-output 1024`
* Long outputs: `--random-input 1024 --random-output 8192`
* Long inputs: `--random-input 8192 --random-output 1024`

**Concurrency Levels**: Adjust `--max-concurrency` to test different load scenarios:

* Low concurrency (latency-focused): `--max-concurrency 1 --num-prompts 100`
* Medium concurrency (balanced): `--max-concurrency 16 --num-prompts 1000`
* High concurrency (throughput-focused): `--max-concurrency 100 --num-prompts 2000`

***

## 📚 Additional Resources

* [Meta Llama Models Repository](https://github.com/meta-llama/llama-models)
* [Llama 3.3 Model Card](https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct)
* [SGLang Documentation](/)
* [AMD ROCm Documentation](https://rocm.docs.amd.com/)
