> ## Documentation Index
> Fetch the complete documentation index at: https://docs.sglang.io/llms.txt
> Use this file to discover all available pages before exploring further.

# Llama 4

export const Llama4MaverickDeployment = () => {
  const options = {
    hardware: {
      name: 'hardware',
      title: 'Hardware Platform',
      items: [{
        id: 'b200',
        label: 'B200',
        default: false
      }, {
        id: 'h200',
        label: 'H200',
        default: false
      }, {
        id: 'mi300x',
        label: 'MI300x',
        default: true
      }, {
        id: 'mi325x',
        label: 'MI325x',
        default: false
      }, {
        id: 'mi355x',
        label: 'MI355x',
        default: false
      }, {
        id: 'xeon',
        label: 'XEON',
        default: false
      }]
    },
    quantization: {
      name: 'quantization',
      title: 'Quantization',
      getDynamicItems: values => [{
        id: 'bf16',
        label: 'BF16',
        default: true
      }, {
        id: 'fp8',
        label: 'FP8',
        default: false,
        disabled: values.hardware === 'xeon'
      }]
    },
    toolcall: {
      name: 'toolcall',
      title: 'Tool Call Parser',
      items: [{
        id: 'disabled',
        label: 'Disabled',
        default: true
      }, {
        id: 'enabled',
        label: 'Enabled',
        default: false
      }]
    },
    speculative: {
      name: 'speculative',
      title: 'Speculative Decoding (EAGLE3)',
      condition: values => values.hardware !== 'xeon',
      items: [{
        id: 'disabled',
        label: 'Disabled',
        default: true
      }, {
        id: 'enabled',
        label: 'Enable EAGLE3',
        default: false
      }]
    },
    host: {
      name: 'host',
      title: 'Host',
      type: 'text',
      default: '0.0.0.0',
      placeholder: '0.0.0.0'
    },
    port: {
      name: 'port',
      title: 'Port',
      type: 'text',
      default: '8000',
      placeholder: '8000'
    }
  };
  const generateCommand = values => {
    const {hardware, quantization, toolcall, speculative, host, port} = values;
    let cmd = 'python -m sglang.launch_server \\\n';
    cmd += `  --model-path meta-llama/Llama-4-Maverick-17B-128E-Instruct`;
    if (hardware === 'h200') {
      cmd += ` \\\n  --tp 8`;
    } else if (hardware === 'b200') {
      cmd += ` \\\n  --tp 8`;
    } else if (hardware === 'mi300x' || hardware === 'mi325x' || hardware === 'mi355x') {
      cmd += ` \\\n  --tp 8`;
    } else if (hardware === 'xeon') {
      cmd += ` \\\n  --device cpu \\\n  --disable-overlap-schedule \\\n  --tp 6`;
    }
    if (quantization === 'fp8' && hardware !== 'xeon') {
      cmd += ` \\\n  --quantization fp8`;
    }
    if (toolcall === 'enabled') {
      cmd += ` \\\n  --tool-call-parser pythonic`;
    }
    if (speculative === 'enabled' && hardware !== 'xeon') {
      cmd += ` \\\n  --speculative-algorithm EAGLE3 \\\n`;
      cmd += `  --speculative-draft-model-path lmsys/sglang-EAGLE3-Llama-4-Maverick-17B-128E-Instruct-v1 \\\n`;
      cmd += `  --speculative-num-steps 3 \\\n`;
      cmd += `  --speculative-eagle-topk 1 \\\n`;
      cmd += `  --speculative-num-draft-tokens 4 \\\n`;
      cmd += `  --mem-fraction-static 0.75 \\\n`;
      cmd += `  --cuda-graph-max-bs-decode 2`;
    }
    cmd += ` \\\n  --enable-multimodal`;
    cmd += ` \\\n  --context-length 65536`;
    cmd += ` \\\n  --dtype bfloat16`;
    cmd += ` \\\n  --trust-remote-code`;
    cmd += ` \\\n  --host ${host || '0.0.0.0'}`;
    cmd += ` \\\n  --port ${port || '8000'}`;
    return cmd;
  };
  const getInitialState = () => {
    const initialState = {};
    Object.entries(options).forEach(([key, option]) => {
      if (option.type === 'checkbox') {
        initialState[key] = (option.items || []).filter(item => item.default).map(item => item.id);
        return;
      }
      if (option.type === 'text') {
        initialState[key] = option.default || '';
        return;
      }
      let items = option.items || [];
      if (option.getDynamicItems) {
        const defaultValues = {};
        Object.entries(options).forEach(([innerKey, innerOption]) => {
          if (innerOption.type === 'checkbox') {
            defaultValues[innerKey] = (innerOption.items || []).filter(item => item.default).map(item => item.id);
          } else if (innerOption.type === 'text') {
            defaultValues[innerKey] = innerOption.default || '';
          } else if (innerOption.items && innerOption.items.length > 0) {
            const defaultItem = innerOption.items.find(item => item.default);
            defaultValues[innerKey] = defaultItem ? defaultItem.id : innerOption.items[0].id;
          }
        });
        items = option.getDynamicItems(defaultValues);
      }
      const defaultItem = items && items.find(item => item.default);
      initialState[key] = defaultItem ? defaultItem.id : items && items[0] ? items[0].id : '';
    });
    return initialState;
  };
  const [values, setValues] = useState(getInitialState);
  const [isDark, setIsDark] = useState(false);
  useEffect(() => {
    const checkDarkMode = () => {
      const html = document.documentElement;
      const isDarkMode = html.classList.contains('dark') || html.getAttribute('data-theme') === 'dark' || html.style.colorScheme === 'dark';
      setIsDark(isDarkMode);
    };
    checkDarkMode();
    const observer = new MutationObserver(checkDarkMode);
    observer.observe(document.documentElement, {
      attributes: true,
      attributeFilter: ['class', 'data-theme', 'style']
    });
    return () => observer.disconnect();
  }, []);
  const handleRadioChange = (optionName, value) => {
    setValues(prev => {
      const next = {
        ...prev,
        [optionName]: value
      };
      if (optionName === 'hardware' && value === 'xeon') {
        next.quantization = 'bf16';
        next.speculative = 'disabled';
      }
      return next;
    });
  };
  const handleCheckboxChange = (optionName, itemId, isChecked) => {
    setValues(prev => {
      const currentValues = prev[optionName] || [];
      if (isChecked) {
        return {
          ...prev,
          [optionName]: [...currentValues, itemId]
        };
      }
      return {
        ...prev,
        [optionName]: currentValues.filter(id => id !== itemId)
      };
    });
  };
  const handleTextChange = (optionName, value) => {
    setValues(prev => ({
      ...prev,
      [optionName]: value
    }));
  };
  const command = generateCommand(values);
  const containerStyle = {
    maxWidth: '900px',
    margin: '0 auto',
    display: 'flex',
    flexDirection: 'column',
    gap: '4px'
  };
  const cardStyle = {
    padding: '8px 12px',
    border: `1px solid ${isDark ? '#374151' : '#e5e7eb'}`,
    borderLeft: `3px solid ${isDark ? '#E85D4D' : '#D45D44'}`,
    borderRadius: '4px',
    display: 'flex',
    alignItems: 'center',
    gap: '12px',
    background: isDark ? '#1f2937' : '#fff'
  };
  const titleStyle = {
    fontSize: '13px',
    fontWeight: '600',
    minWidth: '140px',
    flexShrink: 0,
    color: isDark ? '#e5e7eb' : 'inherit'
  };
  const itemsStyle = {
    display: 'flex',
    rowGap: '2px',
    columnGap: '6px',
    flexWrap: 'wrap',
    alignItems: 'center',
    flex: 1
  };
  const labelBaseStyle = {
    padding: '4px 10px',
    border: `1px solid ${isDark ? '#9ca3af' : '#d1d5db'}`,
    borderRadius: '3px',
    cursor: 'pointer',
    display: 'inline-flex',
    flexDirection: 'column',
    alignItems: 'center',
    justifyContent: 'center',
    fontWeight: '500',
    fontSize: '13px',
    transition: 'all 0.2s',
    userSelect: 'none',
    minWidth: '45px',
    textAlign: 'center',
    flex: 1,
    background: isDark ? '#374151' : '#fff',
    color: isDark ? '#e5e7eb' : 'inherit'
  };
  const checkedStyle = {
    background: '#D45D44',
    color: 'white',
    borderColor: '#D45D44'
  };
  const disabledStyle = {
    cursor: 'not-allowed',
    opacity: 0.5
  };
  const subtitleStyle = {
    display: 'block',
    fontSize: '9px',
    marginTop: '1px',
    lineHeight: '1.1',
    opacity: 0.7
  };
  const textInputStyle = {
    flex: 1,
    padding: '8px 10px',
    borderRadius: '4px',
    border: `1px solid ${isDark ? '#4b5563' : '#d1d5db'}`,
    background: isDark ? '#111827' : '#fff',
    color: isDark ? '#e5e7eb' : '#111827',
    fontSize: '13px'
  };
  const commandDisplayStyle = {
    flex: 1,
    padding: '12px 16px',
    background: isDark ? '#111827' : '#f5f5f5',
    borderRadius: '6px',
    fontFamily: "'Menlo', 'Monaco', 'Courier New', monospace",
    fontSize: '12px',
    lineHeight: '1.5',
    color: isDark ? '#e5e7eb' : '#374151',
    whiteSpace: 'pre-wrap',
    overflowX: 'auto',
    margin: 0,
    border: `1px solid ${isDark ? '#374151' : '#e5e7eb'}`
  };
  return <div style={containerStyle} className="not-prose">
      {Object.entries(options).map(([key, option]) => {
    if (option.condition && !option.condition(values)) {
      return null;
    }
    const items = option.getDynamicItems ? option.getDynamicItems(values) : option.items || [];
    return <div key={key} style={cardStyle}>
            <div style={titleStyle}>{option.title}</div>
            <div style={itemsStyle}>
              {option.type === 'text' ? <input type="text" value={values[option.name] || ''} placeholder={option.placeholder || ''} onChange={event => handleTextChange(option.name, event.target.value)} style={textInputStyle} /> : option.type === 'checkbox' ? (option.items || []).map(item => {
      const isChecked = (values[option.name] || []).includes(item.id);
      const isDisabled = item.required || typeof item.disabledWhen === 'function' && item.disabledWhen(values);
      return <label key={item.id} title={item.disabledReason || ''} style={{
        ...labelBaseStyle,
        ...isChecked ? checkedStyle : {},
        ...isDisabled ? disabledStyle : {}
      }}>
                      <input type="checkbox" checked={isChecked} disabled={isDisabled} onChange={event => handleCheckboxChange(option.name, item.id, event.target.checked)} style={{
        display: 'none'
      }} />
                      {item.label}
                      {item.subtitle && <small style={{
        ...subtitleStyle,
        color: isChecked ? 'rgba(255,255,255,0.85)' : 'inherit'
      }}>
                          {item.subtitle}
                        </small>}
                    </label>;
    }) : items.map(item => {
      const isChecked = values[option.name] === item.id;
      const isDisabled = Boolean(item.disabled);
      return <label key={item.id} title={item.disabledReason || ''} style={{
        ...labelBaseStyle,
        ...isChecked ? checkedStyle : {},
        ...isDisabled ? disabledStyle : {}
      }}>
                      <input type="radio" name={option.name} value={item.id} checked={isChecked} disabled={isDisabled} onChange={() => !isDisabled && handleRadioChange(option.name, item.id)} style={{
        display: 'none'
      }} />
                      {item.label}
                      {item.subtitle && <small style={{
        ...subtitleStyle,
        color: isChecked ? 'rgba(255,255,255,0.85)' : 'inherit'
      }}>
                          {item.subtitle}
                        </small>}
                    </label>;
    })}
            </div>
          </div>;
  })}
      <div style={cardStyle}>
        <div style={titleStyle}>Run this Command:</div>
        <pre style={commandDisplayStyle}>{command}</pre>
      </div>
    </div>;
};

export const Llama4ScoutDeployment = () => {
  const options = {
    hardware: {
      name: 'hardware',
      title: 'Hardware Platform',
      items: [{
        id: 'b200',
        label: 'B200',
        default: false
      }, {
        id: 'h100',
        label: 'H100',
        default: true
      }, {
        id: 'h200',
        label: 'H200',
        default: false
      }, {
        id: 'mi300x',
        label: 'MI300x',
        default: false
      }, {
        id: 'mi325x',
        label: 'MI325x',
        default: false
      }, {
        id: 'mi355x',
        label: 'MI355x',
        default: false
      }, {
        id: 'xeon',
        label: 'XEON',
        default: false
      }]
    },
    quantization: {
      name: 'quantization',
      title: 'Quantization',
      getDynamicItems: values => [{
        id: 'bf16',
        label: 'BF16',
        default: true
      }, {
        id: 'fp8',
        label: 'FP8',
        default: false,
        disabled: values.hardware === 'xeon'
      }]
    },
    toolcall: {
      name: 'toolcall',
      title: 'Tool Call Parser',
      items: [{
        id: 'disabled',
        label: 'Disabled',
        default: true
      }, {
        id: 'enabled',
        label: 'Enabled',
        default: false
      }]
    },
    speculative: {
      name: 'speculative',
      title: 'Speculative Decoding (EAGLE3)',
      condition: values => values.hardware !== 'xeon',
      items: [{
        id: 'disabled',
        label: 'Disabled',
        default: true
      }, {
        id: 'enabled',
        label: 'Enable EAGLE3',
        default: false
      }]
    },
    host: {
      name: 'host',
      title: 'Host',
      type: 'text',
      default: '0.0.0.0',
      placeholder: '0.0.0.0'
    },
    port: {
      name: 'port',
      title: 'Port',
      type: 'text',
      default: '8000',
      placeholder: '8000'
    }
  };
  const generateCommand = values => {
    const {hardware, quantization, toolcall, speculative, host, port} = values;
    let cmd = 'python -m sglang.launch_server \\\n';
    cmd += `  --model-path meta-llama/Llama-4-Scout-17B-16E-Instruct`;
    if (hardware === 'h100' || hardware === 'h200') {
      cmd += ` \\\n  --tp 8`;
    } else if (hardware === 'b200') {
      cmd += ` \\\n  --tp 8`;
    } else if (hardware === 'mi300x' || hardware === 'mi325x' || hardware === 'mi355x') {
      cmd += ` \\\n  --tp 8`;
    } else if (hardware === 'xeon') {
      cmd += ` \\\n  --device cpu \\\n  --disable-overlap-schedule \\\n  --tp 6`;
    }
    if (quantization === 'fp8' && hardware !== 'xeon') {
      cmd += ` \\\n  --quantization fp8`;
    }
    if (toolcall === 'enabled') {
      cmd += ` \\\n  --tool-call-parser pythonic`;
    }
    if (speculative === 'enabled' && hardware !== 'xeon') {
      cmd += ` \\\n  --speculative-algorithm EAGLE3 \\\n`;
      cmd += `  --speculative-draft-model-path lmsys/sglang-EAGLE3-Llama-4-Scout-17B-16E-Instruct-v1 \\\n`;
      cmd += `  --speculative-num-steps 3 \\\n`;
      cmd += `  --speculative-eagle-topk 1 \\\n`;
      cmd += `  --speculative-num-draft-tokens 4 \\\n`;
      cmd += `  --mem-fraction-static 0.75 \\\n`;
      cmd += `  --cuda-graph-max-bs-decode 2`;
    }
    cmd += ` \\\n  --enable-multimodal`;
    cmd += ` \\\n  --context-length 65536`;
    cmd += ` \\\n  --dtype bfloat16`;
    cmd += ` \\\n  --trust-remote-code`;
    cmd += ` \\\n  --host ${host || '0.0.0.0'}`;
    cmd += ` \\\n  --port ${port || '8000'}`;
    return cmd;
  };
  const getInitialState = () => {
    const initialState = {};
    Object.entries(options).forEach(([key, option]) => {
      if (option.type === 'checkbox') {
        initialState[key] = (option.items || []).filter(item => item.default).map(item => item.id);
        return;
      }
      if (option.type === 'text') {
        initialState[key] = option.default || '';
        return;
      }
      let items = option.items || [];
      if (option.getDynamicItems) {
        const defaultValues = {};
        Object.entries(options).forEach(([innerKey, innerOption]) => {
          if (innerOption.type === 'checkbox') {
            defaultValues[innerKey] = (innerOption.items || []).filter(item => item.default).map(item => item.id);
          } else if (innerOption.type === 'text') {
            defaultValues[innerKey] = innerOption.default || '';
          } else if (innerOption.items && innerOption.items.length > 0) {
            const defaultItem = innerOption.items.find(item => item.default);
            defaultValues[innerKey] = defaultItem ? defaultItem.id : innerOption.items[0].id;
          }
        });
        items = option.getDynamicItems(defaultValues);
      }
      const defaultItem = items && items.find(item => item.default);
      initialState[key] = defaultItem ? defaultItem.id : items && items[0] ? items[0].id : '';
    });
    return initialState;
  };
  const [values, setValues] = useState(getInitialState);
  const [isDark, setIsDark] = useState(false);
  useEffect(() => {
    const checkDarkMode = () => {
      const html = document.documentElement;
      const isDarkMode = html.classList.contains('dark') || html.getAttribute('data-theme') === 'dark' || html.style.colorScheme === 'dark';
      setIsDark(isDarkMode);
    };
    checkDarkMode();
    const observer = new MutationObserver(checkDarkMode);
    observer.observe(document.documentElement, {
      attributes: true,
      attributeFilter: ['class', 'data-theme', 'style']
    });
    return () => observer.disconnect();
  }, []);
  const handleRadioChange = (optionName, value) => {
    setValues(prev => {
      const next = {
        ...prev,
        [optionName]: value
      };
      if (optionName === 'hardware' && value === 'xeon') {
        next.quantization = 'bf16';
        next.speculative = 'disabled';
      }
      return next;
    });
  };
  const handleCheckboxChange = (optionName, itemId, isChecked) => {
    setValues(prev => {
      const currentValues = prev[optionName] || [];
      if (isChecked) {
        return {
          ...prev,
          [optionName]: [...currentValues, itemId]
        };
      }
      return {
        ...prev,
        [optionName]: currentValues.filter(id => id !== itemId)
      };
    });
  };
  const handleTextChange = (optionName, value) => {
    setValues(prev => ({
      ...prev,
      [optionName]: value
    }));
  };
  const command = generateCommand(values);
  const containerStyle = {
    maxWidth: '900px',
    margin: '0 auto',
    display: 'flex',
    flexDirection: 'column',
    gap: '4px'
  };
  const cardStyle = {
    padding: '8px 12px',
    border: `1px solid ${isDark ? '#374151' : '#e5e7eb'}`,
    borderLeft: `3px solid ${isDark ? '#E85D4D' : '#D45D44'}`,
    borderRadius: '4px',
    display: 'flex',
    alignItems: 'center',
    gap: '12px',
    background: isDark ? '#1f2937' : '#fff'
  };
  const titleStyle = {
    fontSize: '13px',
    fontWeight: '600',
    minWidth: '140px',
    flexShrink: 0,
    color: isDark ? '#e5e7eb' : 'inherit'
  };
  const itemsStyle = {
    display: 'flex',
    rowGap: '2px',
    columnGap: '6px',
    flexWrap: 'wrap',
    alignItems: 'center',
    flex: 1
  };
  const labelBaseStyle = {
    padding: '4px 10px',
    border: `1px solid ${isDark ? '#9ca3af' : '#d1d5db'}`,
    borderRadius: '3px',
    cursor: 'pointer',
    display: 'inline-flex',
    flexDirection: 'column',
    alignItems: 'center',
    justifyContent: 'center',
    fontWeight: '500',
    fontSize: '13px',
    transition: 'all 0.2s',
    userSelect: 'none',
    minWidth: '45px',
    textAlign: 'center',
    flex: 1,
    background: isDark ? '#374151' : '#fff',
    color: isDark ? '#e5e7eb' : 'inherit'
  };
  const checkedStyle = {
    background: '#D45D44',
    color: 'white',
    borderColor: '#D45D44'
  };
  const disabledStyle = {
    cursor: 'not-allowed',
    opacity: 0.5
  };
  const subtitleStyle = {
    display: 'block',
    fontSize: '9px',
    marginTop: '1px',
    lineHeight: '1.1',
    opacity: 0.7
  };
  const textInputStyle = {
    flex: 1,
    padding: '8px 10px',
    borderRadius: '4px',
    border: `1px solid ${isDark ? '#4b5563' : '#d1d5db'}`,
    background: isDark ? '#111827' : '#fff',
    color: isDark ? '#e5e7eb' : '#111827',
    fontSize: '13px'
  };
  const commandDisplayStyle = {
    flex: 1,
    padding: '12px 16px',
    background: isDark ? '#111827' : '#f5f5f5',
    borderRadius: '6px',
    fontFamily: "'Menlo', 'Monaco', 'Courier New', monospace",
    fontSize: '12px',
    lineHeight: '1.5',
    color: isDark ? '#e5e7eb' : '#374151',
    whiteSpace: 'pre-wrap',
    overflowX: 'auto',
    margin: 0,
    border: `1px solid ${isDark ? '#374151' : '#e5e7eb'}`
  };
  return <div style={containerStyle} className="not-prose">
      {Object.entries(options).map(([key, option]) => {
    if (option.condition && !option.condition(values)) {
      return null;
    }
    const items = option.getDynamicItems ? option.getDynamicItems(values) : option.items || [];
    return <div key={key} style={cardStyle}>
            <div style={titleStyle}>{option.title}</div>
            <div style={itemsStyle}>
              {option.type === 'text' ? <input type="text" value={values[option.name] || ''} placeholder={option.placeholder || ''} onChange={event => handleTextChange(option.name, event.target.value)} style={textInputStyle} /> : option.type === 'checkbox' ? (option.items || []).map(item => {
      const isChecked = (values[option.name] || []).includes(item.id);
      const isDisabled = item.required || typeof item.disabledWhen === 'function' && item.disabledWhen(values);
      return <label key={item.id} title={item.disabledReason || ''} style={{
        ...labelBaseStyle,
        ...isChecked ? checkedStyle : {},
        ...isDisabled ? disabledStyle : {}
      }}>
                      <input type="checkbox" checked={isChecked} disabled={isDisabled} onChange={event => handleCheckboxChange(option.name, item.id, event.target.checked)} style={{
        display: 'none'
      }} />
                      {item.label}
                      {item.subtitle && <small style={{
        ...subtitleStyle,
        color: isChecked ? 'rgba(255,255,255,0.85)' : 'inherit'
      }}>
                          {item.subtitle}
                        </small>}
                    </label>;
    }) : items.map(item => {
      const isChecked = values[option.name] === item.id;
      const isDisabled = Boolean(item.disabled);
      return <label key={item.id} title={item.disabledReason || ''} style={{
        ...labelBaseStyle,
        ...isChecked ? checkedStyle : {},
        ...isDisabled ? disabledStyle : {}
      }}>
                      <input type="radio" name={option.name} value={item.id} checked={isChecked} disabled={isDisabled} onChange={() => !isDisabled && handleRadioChange(option.name, item.id)} style={{
        display: 'none'
      }} />
                      {item.label}
                      {item.subtitle && <small style={{
        ...subtitleStyle,
        color: isChecked ? 'rgba(255,255,255,0.85)' : 'inherit'
      }}>
                          {item.subtitle}
                        </small>}
                    </label>;
    })}
            </div>
          </div>;
  })}
      <div style={cardStyle}>
        <div style={titleStyle}>Run this Command:</div>
        <pre style={commandDisplayStyle}>{command}</pre>
      </div>
    </div>;
};

## 1. Model Introduction

[Llama 4](https://github.com/meta-llama/llama-models/blob/main/models/llama4/MODEL_CARD.md) is Meta's latest generation of open-source large language models, with industry-leading performance.

SGLang has supported Llama 4 Scout (109B) and Llama 4 Maverick (400B) since [v0.4.5](https://github.com/sgl-project/sglang/releases/tag/v0.4.5).

Ongoing optimizations are tracked in the [Roadmap](https://github.com/sgl-project/sglang/issues/5118).

This generation delivers comprehensive upgrades across the board:

The highly capable Llama 4 Maverick has 17B active parameters out of \~400B total, with 128 experts.
The efficient Llama 4 Scout also has 17B active parameters out of \~109B total, using just 16 experts.
Both models leverage early fusion for native multimodality, enabling them to process text and image inputs. Maverick and Scout are both trained on up to 40 trillion tokens of data spanning 200 languages (with specific fine-tuning support for 12 languages including Arabic, Spanish, German, and Hindi).

For more details, please refer to the official Llama 4 page: [https://www.llama.com/models/llama-4/](https://www.llama.com/models/llama-4/)

## 2. SGLang Installation

SGLang offers multiple installation methods. You can choose the most suitable installation method based on your hardware platform and requirements.

Please refer to the [official SGLang installation guide](../../../docs/get-started/install) for installation instructions.

For SGLang CPU installation, please refer to the [CPU version installation guide](../../../docs/hardware-platforms/cpu_server#installation).

## 3. Model Deployment

This section provides a progressive guide from quick deployment to performance optimization, suitable for users at different levels.

### 3.1 Basic Configuration

**Interactive Command Generator**: Use the configuration selectors below to automatically generate the appropriate deployment command for your hardware platform, quantization, tool call parser, speculative decoding, and host/port settings. The first selector is for Llama 4 Scout and the second is for Llama 4 Maverick.

<Llama4ScoutDeployment />

<Llama4MaverickDeployment />

### 3.2 Configuration Tips

* **OOM Mitigation:** Reduce `--context-length` to avoid GPU out-of-memory. Recommended: Scout up to 1M on 8×H100, up to 2.5M on 8×H200; Maverick doesn't need context-length set on 8×H200. With hybrid KV cache enabled, Scout can reach 5M on 8×H100 and 10M on 8×H200.
* **Attention Backend Auto-Selection:** SGLang automatically picks the optimal backend. Manual override with `--attention-backend`:
  * Blackwell (B200/GB200): `trtllm_mha`
  * Hopper (H100/H200): `fa3`
  * AMD GPUs: `aiter`
  * Intel XPU: `intel_xpu`
  * Other: `triton`
* **Chat Template:** Add `--chat-template llama-4` for chat completion tasks.
* **Multi-Modal:** Add `--enable-multimodal` to enable image input support.
* **Hybrid KV Cache:** Set `--swa-full-tokens-ratio` to control the ratio of SWA (local attention) KV tokens to full-attention KV tokens (default: 0.8, range: 0–1).
* **EAGLE Speculative Decoding:** Supported for Llama 4 Scout and Maverick via EAGLE3. Enable with the interactive command generator above.
* **Xeon CPU service configuration:** Please refer to the `Notes` part in the serving engine launching section in [the SGLang CPU server document](../../../docs/hardware-platforms/cpu_server#launch-of-the-serving-engine) to better understand how to configure the arguments, especially for TP (tensor parallel) and NUMA binding settings.

## 4. Model Invocation

### 4.1 Basic Usage

For basic API usage and request examples, please refer to:

* [SGLang Basic Usage Guide](../../../docs/basic_usage/send_request)
* [SGLang OpenAI Vision API Guide](../../../docs/basic_usage/openai_api_vision)

### 4.2 Advanced Usage

#### 4.2.1 Launch the docker

```shell Command theme={null}
docker pull lmsysorg/sglang:v0.5.9-rocm720-mi30x
```

```shell Command theme={null}
docker run -d -it --ipc=host --network=host --privileged \
  --cap-add=CAP_SYS_ADMIN \
  --device=/dev/kfd --device=/dev/dri --device=/dev/mem \
  --group-add video --cap-add=SYS_PTRACE \
  --security-opt seccomp=unconfined \
  -v /:/work \
  -e SHELL=/bin/bash \
  --name Llama4 \
  lmsysorg/sglang:v0.5.9-rocm720-mi30x \
  /bin/bash
```

#### 4.2.2 Launch the server

### Llama-4-Scout

8-GPU deployment command:

```bash Command theme={null}
sglang serve \
  --model-path meta-llama/Llama-4-Scout-17B-16E-Instruct \
  --tp 8 \
  --context-length 1000000 \
  --trust-remote-code
```

### Llama-4-Maverick

8-GPU deployment command:

```bash Command theme={null}
sglang serve \
  --model-path meta-llama/Llama-4-Maverick-17B-128E-Instruct \
  --tp 8 \
  --trust-remote-code
```

#### 4.2.3 EAGLE Speculative Decoding

SGLang supports Llama 4 Maverick (400B) with [EAGLE speculative decoding](../../../docs/advanced_features/speculative_decoding). Enable with the EAGLE3 algorithm and the SGLang EAGLE3 draft model:

```shell Command theme={null}
python3 -m sglang.launch_server \
  --model-path meta-llama/Llama-4-Maverick-17B-128E-Instruct \
  --speculative-algorithm EAGLE3 \
  --speculative-draft-model-path lmsys/sglang-EAGLE3-Llama-4-Maverick-17B-128E-Instruct-v1 \
  --speculative-num-steps 3 \
  --speculative-eagle-topk 1 \
  --speculative-num-draft-tokens 4 \
  --trust-remote-code \
  --tp 8
```

## 5. Benchmark

### 5.1 Speed Benchmark (Scout)

Test Environment:

Hardware: AMD MI300x GPU

Model: Llama-4-Scout

Tensor Parallelism: 8

sglang version: 0.5.9

* **Model Deployment**

```bash Command theme={null}
sglang serve \
  --model-path meta-llama/Llama-4-Scout-17B-16E-Instruct \
  --tp 8 \
  --context-length 1000000 \
  --trust-remote-code
```

### 5.1.1 Low Concurrency (Latency-Optimized)

* Benchmark Command:

```bash Command theme={null}
python3 -m sglang.bench_serving \
  --backend sglang \
  --model meta-llama/Llama-4-Scout-17B-16E-Instruct \
  --dataset-name random \
  --random-input-len 1000 \
  --random-output-len 1000 \
  --num-prompts 10 \
  --max-concurrency 1 \
  --request-rate inf
```

* Test Results:

```text Output theme={null}
============ Serving Benchmark Result ============
Backend:                                 sglang
Traffic request rate:                    inf
Max request concurrency:                 1
Successful requests:                     10
Benchmark duration (s):                  74.62
Total input tokens:                      6101
Total input text tokens:                 6101
Total input vision tokens:               0
Total generated tokens:                  4220
Total generated tokens (retokenized):    4211
Request throughput (req/s):              0.14
Input token throughput (tok/s):          82.88
Output token throughput (tok/s):         57.42
Peak output token throughput (tok/s):    146.00
Peak concurrent requests:                2
Total token throughput (tok/s):          140.20
Concurrency:                             1.00
----------------End-to-End Latency----------------
Mean E2E Latency (ms):                   7459.48
Median E2E Latency (ms):                 4489.77
---------------Time to First Token----------------
Mean TTFT (ms):                          4246.98
Median TTFT (ms):                        68.57
P99 TTFT (ms):                           48091.05
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms):                          7.49
Median TPOT (ms):                        7.40
P99 TPOT (ms):                           7.40
---------------Inter-Token Latency----------------
Mean ITL (ms):                           7.49
Median ITL (ms):                         7.49
P95 ITL (ms):                            7.47
P99 ITL (ms):                            7.52
Max ITL (ms):                            10.44
==================================================
```

### 5.1.2 Medium Concurrency (Balanced)

* Benchmark Command:

```bash Command theme={null}
python3 -m sglang.bench_serving \
  --backend sglang \
  --model meta-llama/Llama-4-Scout-17B-16E-Instruct \
  --dataset-name random \
  --random-input-len 1000 \
  --random-output-len 1000 \
  --num-prompts 80 \
  --max-concurrency 16 \
  --request-rate inf
```

* Test Results:

```text Output theme={null}
============ Serving Benchmark Result ============
Backend:                                 sglang
Traffic request rate:                    inf
Max request concurrency:                 16
Successful requests:                     80
Benchmark duration (s):                  45.41
Total input tokens:                      49668
Total input text tokens:                 49668
Total input vision tokens:               0
Total generated tokens:                  40805
Total generated tokens (retokenized):    40516
Request throughput (req/s):              2.26
Input token throughput (tok/s):          1120.46
Output token throughput (tok/s):         1152.47
Peak output token throughput (tok/s):    1520.00
Peak concurrent requests:                21
Total token throughput (tok/s):          2272.84
Concurrency:                             14.76
----------------End-to-End Latency----------------
Mean E2E Latency (ms):                   6089.22
Median E2E Latency (ms):                 6568.80
---------------Time to First Token----------------
Mean TTFT (ms):                          124.44
Median TTFT (ms):                        87.42
P99 TTFT (ms):                           268.72
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms):                          11.88
Median TPOT (ms):                        12.00
P99 TPOT (ms):                           15.49
---------------Inter-Token Latency----------------
Mean ITL (ms):                           11.72
Median ITL (ms):                         10.54
P95 ITL (ms):                            11.22
P99 ITL (ms):                            67.88
Max ITL (ms):                            74.05
==================================================
```

#### 5.1.3 High Concurrency (Throughput-Optimized)

* Benchmark Command:

```bash Command theme={null}
python3 -m sglang.bench_serving \
  --backend sglang \
  --model meta-llama/Llama-4-Scout-17B-16E-Instruct \
  --dataset-name random \
  --random-input-len 1000 \
  --random-output-len 1000 \
  --num-prompts 500 \
  --max-concurrency 100 \
  --request-rate inf
```

* Test Results:

```text Output theme={null}
============ Serving Benchmark Result ============
Backend:                                 sglang
Traffic request rate:                    inf
Max request concurrency:                 100
Successful requests:                     500
Benchmark duration (s):                  85.84
Total input tokens:                      249841
Total input text tokens:                 249841
Total input vision tokens:               0
Total generated tokens:                  252662
Total generated tokens (retokenized):    250498
Request throughput (req/s):              5.84
Input token throughput (tok/s):          2910.84
Output token throughput (tok/s):         2944.82
Peak output token throughput (tok/s):    4100.00
Peak concurrent requests:                110
Total token throughput (tok/s):          5854.65
Concurrency:                             92.24
----------------End-to-End Latency----------------
Mean E2E Latency (ms):                   15844.00
Median E2E Latency (ms):                 15262.56
---------------Time to First Token----------------
Mean TTFT (ms):                          204.46
Median TTFT (ms):                        129.96
P99 TTFT (ms):                           528.54
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms):                          41.56
Median TPOT (ms):                        42.90
P99 TPOT (ms):                           47.48
---------------Inter-Token Latency----------------
Mean ITL (ms):                           40.99
Median ITL (ms):                         24.46
P95 ITL (ms):                            84.46
P99 ITL (ms):                            87.64
Max ITL (ms):                            226.06
==================================================
```

### 5.2 Speed Benchmark (Maverick)

Test Environment:

Hardware: AMD MI300x GPU

Model: Llama-4-Maverick

Tensor Parallelism: 8

sglang version: 0.5.9

* **Model Deployment**

```bash Command theme={null}
sglang serve \
  --model-path meta-llama/Llama-4-Maverick-17B-128E-Instruct \
  --tp 8 \
  --context-length 1000000 \
  --trust-remote-code
```

#### 5.2.1 Low Concurrency (Latency-Optimized)

* Benchmark Command:

```bash Command theme={null}
python3 -m sglang.bench_serving \
  --backend sglang \
  --model meta-llama/Llama-4-Maverick-17B-128E-Instruct \
  --dataset-name random \
  --random-input-len 1000 \
  --random-output-len 1000 \
  --num-prompts 10 \
  --max-concurrency 1 \
  --request-rate inf
```

* Test Results:

```text Output theme={null}
============ Serving Benchmark Result ============
Backend:                                 sglang
Traffic request rate:                    inf
Max request concurrency:                 1
Successful requests:                     10
Benchmark duration (s):                  68.08
Total input tokens:                      6101
Total input text tokens:                 6101
Total input vision tokens:               0
Total generated tokens:                  4220
Total generated tokens (retokenized):    4202
Request throughput (req/s):              0.15
Input token throughput (tok/s):          89.62
Output token throughput (tok/s):         61.99
Peak output token throughput (tok/s):    168.00
Peak concurrent requests:                2
Total token throughput (tok/s):          151.61
Concurrency:                             1.00
----------------End-to-End Latency----------------
Mean E2E Latency (ms):                   6805.62
Median E2E Latency (ms):                 2733.91
---------------Time to First Token----------------
Mean TTFT (ms):                          4296.56
Median TTFT (ms):                        57.45
P99 TTFT (ms):                           38633.95
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms):                          5.95
Median TPOT (ms):                        5.96
P99 TPOT (ms):                           5.97
---------------Inter-Token Latency----------------
Mean ITL (ms):                           5.96
Median ITL (ms):                         5.96
P95 ITL (ms):                            6.02
P99 ITL (ms):                            6.08
Max ITL (ms):                            7.02
==================================================
```

#### 5.2.2 Medium Concurrency (Balanced)

* Benchmark Command:

```bash Command theme={null}
python3 -m sglang.bench_serving \
  --backend sglang \
  --model meta-llama/Llama-4-Maverick-17B-128E-Instruct \
  --dataset-name random \
  --random-input-len 1000 \
  --random-output-len 1000 \
  --num-prompts 80 \
  --max-concurrency 16 \
  --request-rate inf
```

* Test Results:

```text Output theme={null}
============ Serving Benchmark Result ============
Backend:                                 sglang
Traffic request rate:                    inf
Max request concurrency:                 16
Successful requests:                     80
Benchmark duration (s):                  30.72
Total input tokens:                      39668
Total input text tokens:                 39668
Total input vision tokens:               0
Total generated tokens:                  40805
Total generated tokens (retokenized):    40923
Request throughput (req/s):              2.60
Input token throughput (tok/s):          1291.39
Output token throughput (tok/s):         1328.41
Peak output token throughput (tok/s):    1760.00
Peak concurrent requests:                22
Total token throughput (tok/s):          2619.80
Concurrency:                             13.92
----------------End-to-End Latency----------------
Mean E2E Latency (ms):                   5345.15
Median E2E Latency (ms):                 5679.73
---------------Time to First Token----------------
Mean TTFT (ms):                          259.30
Median TTFT (ms):                        72.60
P99 TTFT (ms):                           1063.45
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms):                          10.53
Median TPOT (ms):                        10.22
P99 TPOT (ms):                           20.27
---------------Inter-Token Latency----------------
Mean ITL (ms):                           9.99
Median ITL (ms):                         9.10
P95 ITL (ms):                            9.87
P99 ITL (ms):                            55.62
Max ITL (ms):                            868.54
==================================================
```

#### 5.2.3 High Concurrency (Throughput-Optimized)

* Benchmark Command:

```bash Command theme={null}
python3 -m sglang.bench_serving \
  --backend sglang \
  --model meta-llama/Llama-4-Maverick-17B-128E-Instruct \
  --dataset-name random \
  --random-input-len 1000 \
  --random-output-len 1000 \
  --num-prompts 500 \
  --max-concurrency 100 \
  --request-rate inf
```

* Test Results:

```text Output theme={null}
============ Serving Benchmark Result ============
Backend:                                 sglang
Traffic request rate:                    inf
Max request concurrency:                 100
Successful requests:                     500
Benchmark duration (s):                  90.95
Total input tokens:                      249831
Total input text tokens:                 249831
Total input vision tokens:               0
Total generated tokens:                  252662
Total generated tokens (retokenized):    251625
Request throughput (req/s):              5.50
Input token throughput (tok/s):          2746.77
Output token throughput (tok/s):         2777.90
Peak output token throughput (tok/s):    3700.00
Peak concurrent requests:                109
Total token throughput (tok/s):          5524.67
Concurrency:                             93.04
----------------End-to-End Latency----------------
Mean E2E Latency (ms):                   16924.17
Median E2E Latency (ms):                 16294.85
---------------Time to First Token----------------
Mean TTFT (ms):                          188.19
Median TTFT (ms):                        128.96
P99 TTFT (ms):                           534.81
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms):                          33.63
Median TPOT (ms):                        35.37
P99 TPOT (ms):                           38.26
---------------Inter-Token Latency----------------
Mean ITL (ms):                           33.19
Median ITL (ms):                         27.66
P95 ITL (ms):                            76.91
P99 ITL (ms):                            78.82
Max ITL (ms):                            268.17
==================================================
```

### 5.3 Accuracy Benchmark

#### 5.3.1 GSM8K Benchmark

* **Benchmark Command:**

```shell Command theme={null}
python3 -m sglang.test.few_shot_gsm8k --num-questions 200
```

* Llama-4-Scout-17B-16E-Instruct

```text Output theme={null}
Accuracy: 0.945
Invalid: 0.000
Latency: 12.731 s
Output throughput: 1595.418 token/s
```

* Llama-4-Maverick-17B-128E-Instruct

```text Output theme={null}
Accuracy: 0.895
Invalid: 0.000
Latency: 9.739 s
Output throughput: 2405.505 token/s
```

#### 5.3.2 MMLU Pro with lm-eval

Served with SGLang on 8×H100, both models score on par with (and slightly above) [Meta's official benchmark numbers](https://ai.meta.com/blog/llama-4-multimodal-intelligence/) on MMLU Pro (reproduction details: [PR #5092](https://github.com/sgl-project/sglang/pull/5092)):

<table style={{width: "100%", borderCollapse: "collapse"}}>
  <thead>
    <tr style={{borderBottom: "2px solid #d55816"}}>
      <th style={{textAlign: "left", padding: "10px 12px", fontWeight: 700, backgroundColor: "rgba(255,255,255,0.02)"}}>Model</th>
      <th style={{textAlign: "right", padding: "10px 12px", fontWeight: 700, backgroundColor: "rgba(255,255,255,0.05)"}}>Official</th>
      <th style={{textAlign: "right", padding: "10px 12px", fontWeight: 700, backgroundColor: "rgba(255,255,255,0.02)"}}>SGLang</th>
    </tr>
  </thead>

  <tbody>
    <tr>
      <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.02)"}}>Llama-4-Scout-17B-16E-Instruct</td>
      <td style={{padding: "9px 12px", textAlign: "right", backgroundColor: "rgba(255,255,255,0.05)"}}>74.3</td>
      <td style={{padding: "9px 12px", textAlign: "right", backgroundColor: "rgba(255,255,255,0.02)"}}>75.2</td>
    </tr>

    <tr>
      <td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.02)"}}>Llama-4-Maverick-17B-128E-Instruct</td>
      <td style={{padding: "9px 12px", textAlign: "right", backgroundColor: "rgba(255,255,255,0.05)"}}>80.5</td>
      <td style={{padding: "9px 12px", textAlign: "right", backgroundColor: "rgba(255,255,255,0.02)"}}>80.7</td>
    </tr>
  </tbody>
</table>

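To reproduce the SGLang numbers, start the server and then run `lm_eval` against it using the commands below.

**Scout:**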

```bash Command theme={null}
# Start the server
python -m sglang.launch_server \
  --model-path meta-llama/Llama-4-Scout-17B-16E-Instruct \
  --port 30000 \
  --tp 8 \
  --mem-fraction-static 0.8 \
  --context-length 65536

# Run lm_eval
lm_eval --model local-chat-completions \
  --model_args model=meta-llama/Llama-4-Scout-17B-16E-Instruct,base_url=http://localhost:30000/v1/chat/completions,num_concurrent=128,timeout=999999,max_gen_toks=2048 \
  --tasks mmlu_pro \
  --batch_size 128 \
  --apply_chat_template \
  --num_fewshot 0
```

**Maverick:**

```bash Command theme={null}
# Start the server
python -m sglang.launch_server \
  --model-path meta-llama/Llama-4-Maverick-17B-128E-Instruct \
  --port 30000 \
  --tp 8 \
  --mem-fraction-static 0.8 \
  --context-length 65536

# Run lm_eval
lm_eval --model local-chat-completions \
  --model_args model=meta-llama/Llama-4-Maverick-17B-128E-Instruct,base_url=http://localhost:30000/v1/chat/completions,num_concurrent=128,timeout=999999,max_gen_toks=2048 \
  --tasks mmlu_pro \
  --batch_size 128 \
  --apply_chat_template \
  --num_fewshot 0
```