> ## Documentation Index
> Fetch the complete documentation index at: https://docs.sglang.io/llms.txt
> Use this file to discover all available pages before exploring further.

# Step-3.7-Flash (new)

export const Step37FlashDeployment = () => {
  // Declarative description of every selector row shown by the generator.
  // Each entry carries:
  //   name / title      - state key and the label rendered next to the row
  //   items             - static choices, or
  //   getDynamicItems   - function of the current selections returning the choices
  //   commandRule       - optional function mapping the selected value to extra CLI
  //                       text (or null when nothing should be appended)
  const options = {
    hardware: {
      name: 'hardware',
      title: 'Hardware Platform',
      items: [{
        id: 'hopper',
        label: 'Hopper',
        default: true
      }, {
        id: 'b200_b300',
        label: 'B200/B300',
        default: false
      }, {
        id: 'gb200_gb300',
        label: 'GB200/GB300',
        default: false
      }]
    },
    quantization: {
      name: 'quantization',
      title: 'Quantization',
      // NVFP4 is only offered when the selected hardware is not Hopper.
      getDynamicItems: values => {
        const isHopper = values.hardware === 'hopper';
        return [{
          id: 'bf16',
          label: 'BF16',
          default: true
        }, {
          id: 'fp8',
          label: 'FP8',
          default: false
        }, ...(isHopper ? [] : [{
          id: 'nvfp4',
          label: 'NVFP4',
          default: false
        }])];
      }
    },
    reasoningParser: {
      name: 'reasoningParser',
      title: 'Reasoning Parser',
      items: [{
        id: 'disabled',
        label: 'Disabled',
        default: true
      }, {
        id: 'enabled',
        label: 'Enabled',
        default: false
      }],
      commandRule: value => value === 'enabled' ? '--reasoning-parser step3p5' : null
    },
    toolcall: {
      name: 'toolcall',
      title: 'Tool Call Parser',
      items: [{
        id: 'disabled',
        label: 'Disabled',
        default: true
      }, {
        id: 'enabled',
        label: 'Enabled',
        default: false
      }],
      commandRule: value => value === 'enabled' ? '--tool-call-parser step3p5' : null
    },
    speculative: {
      name: 'speculative',
      title: 'Speculative Decoding',
      // The "Enabled" choice is flagged as disabled while NVFP4 is selected.
      getDynamicItems: values => {
        const isNVFP4 = values.quantization === 'nvfp4';
        return [{
          id: 'disabled',
          label: 'Disabled',
          default: true
        }, {
          id: 'enabled',
          label: 'Enabled',
          default: false,
          disabled: isNVFP4,
          disabledReason: 'Not supported with NVFP4'
        }];
      },
      commandRule: value => {
        if (value !== 'enabled') return null;
        // Flags are joined with the shell line-continuation separator so the
        // result carries no trailing whitespace after the final flag.
        return [
          '--speculative-algorithm EAGLE',
          '--speculative-num-steps 3',
          '--speculative-eagle-topk 1',
          '--speculative-num-draft-tokens 4',
          '--enable-multi-layer-eagle'
        ].join(' \\\n  ');
      }
    }
  };
  // Build the multi-line `sglang serve` command for the current selections.
  // The command is collected as a list of segments and joined with the shell
  // line-continuation separator (" \" + newline + two spaces).
  //   values - current selections keyed by option name; `hardware` and
  //            `quantization` are read directly, the rest go through each
  //            option's `commandRule`.
  // Returns the command as a single string.
  const generateCommand = values => {
    const usesFP8 = values.quantization === 'fp8';
    const usesNVFP4 = values.quantization === 'nvfp4';

    let modelSuffix = '';
    if (usesFP8) {
      modelSuffix = '-FP8';
    } else if (usesNVFP4) {
      modelSuffix = '-NVFP4';
    }

    // GB200/GB300 uses a parallel degree of 4; every other platform uses 8.
    const parallelDegree = values.hardware === 'gb200_gb300' ? 4 : 8;

    const segments = ['sglang serve', `--model-path stepfun-ai/Step-3.7-Flash${modelSuffix}`];
    if (parallelDegree > 1) {
      segments.push(`--tp ${parallelDegree}`);
    }
    // Quantized checkpoints also receive --ep with the same degree.
    if (usesFP8 || usesNVFP4) {
      segments.push(`--ep ${parallelDegree}`);
    }
    if (usesNVFP4) {
      segments.push(
        '--moe-runner-backend flashinfer_trtllm',
        '--kv-cache-dtype fp8_e4m3',
        '--quantization modelopt_fp4',
        '--attention-backend trtllm_mha'
      );
    }
    segments.push('--trust-remote-code');

    // Append whatever each option's commandRule contributes, in declaration order.
    Object.keys(options).forEach(key => {
      const option = options[key];
      if (!option.commandRule) return;
      const extra = option.commandRule(values[key], values);
      if (extra) {
        segments.push(extra);
      }
    });

    return segments.join(' \\\n  ');
  };
  // Compute the initial selection for every entry in `options`.
  //   checkbox options -> array of the ids flagged `default`
  //   text options     -> the option's `default` string, or ''
  //   other options    -> id of the item flagged `default`, else the first
  //                       item's id, else ''
  // Options exposing `getDynamicItems` are resolved against the defaults of
  // the statically defined options.
  const getInitialState = () => {
    const defaultIds = option => (option.items || []).filter(item => item.default).map(item => item.id);

    // Fresh map of defaults for options that can be resolved without calling
    // getDynamicItems; options with no static items are left out entirely.
    const buildStaticDefaults = () => {
      const seed = {};
      for (const [name, candidate] of Object.entries(options)) {
        if (candidate.type === 'checkbox') {
          seed[name] = defaultIds(candidate);
        } else if (candidate.type === 'text') {
          seed[name] = candidate.default || '';
        } else if (candidate.items && candidate.items.length > 0) {
          const preferred = candidate.items.find(item => item.default);
          seed[name] = preferred ? preferred.id : candidate.items[0].id;
        }
      }
      return seed;
    };

    const pickSingleChoice = option => {
      const choices = option.getDynamicItems ? option.getDynamicItems(buildStaticDefaults()) : option.items || [];
      if (!choices) return '';
      const preferred = choices.find(item => item.default);
      if (preferred) return preferred.id;
      return choices[0] ? choices[0].id : '';
    };

    return Object.entries(options).reduce((state, [key, option]) => {
      if (option.type === 'checkbox') {
        state[key] = defaultIds(option);
      } else if (option.type === 'text') {
        state[key] = option.default || '';
      } else {
        state[key] = pickSingleChoice(option);
      }
      return state;
    }, {});
  };
  // Current selections; getInitialState is passed as a lazy initializer.
  const [values, setValues] = useState(getInitialState);
  // Whether the surrounding page currently looks like it is in dark mode.
  const [isDark, setIsDark] = useState(false);
  // Keep `isDark` in sync with the <html> element: check once on mount, then
  // re-check whenever its class, data-theme or style attribute changes.
  useEffect(() => {
    const looksDark = () => {
      const root = document.documentElement;
      if (root.classList.contains('dark')) return true;
      if (root.getAttribute('data-theme') === 'dark') return true;
      return root.style.colorScheme === 'dark';
    };
    const syncTheme = () => {
      setIsDark(looksDark());
    };
    syncTheme();
    const themeObserver = new MutationObserver(syncTheme);
    themeObserver.observe(document.documentElement, {
      attributes: true,
      attributeFilter: ['class', 'data-theme', 'style']
    });
    // Stop observing when the component unmounts.
    return () => themeObserver.disconnect();
  }, []);
  // Store a single-choice selection and repair selections it invalidates:
  //   - switching hardware to Hopper while NVFP4 is selected falls back to BF16
  //   - switching quantization to NVFP4 turns speculative decoding off
  const handleRadioChange = (optionName, value) => {
    const applySelection = prev => {
      const updated = { ...prev, [optionName]: value };
      const switchedToHopper = optionName === 'hardware' && value === 'hopper';
      const switchedToNVFP4 = optionName === 'quantization' && value === 'nvfp4';
      if (switchedToHopper && prev.quantization === 'nvfp4') {
        updated.quantization = 'bf16';
      }
      if (switchedToNVFP4 && prev.speculative === 'enabled') {
        updated.speculative = 'disabled';
      }
      return updated;
    };
    setValues(applySelection);
  };
  // Add `itemId` to, or remove it from, the id list stored under `optionName`.
  // A missing list is treated as empty; the previous state object is not mutated.
  const handleCheckboxChange = (optionName, itemId, isChecked) => {
    const toggleItem = prev => {
      const selected = prev[optionName] || [];
      const nextSelected = isChecked ? [...selected, itemId] : selected.filter(id => id !== itemId);
      return { ...prev, [optionName]: nextSelected };
    };
    setValues(toggleItem);
  };
  // Store the text typed into a free-form option under `optionName`,
  // leaving every other selection untouched.
  const handleTextChange = (optionName, value) => {
    const applyText = prev => {
      return { ...prev, [optionName]: value };
    };
    setValues(applyText);
  };
  // Command text for the current selections, recomputed on every render.
  const command = generateCommand(values);

  // Pick between the dark-theme and light-theme variant of a value.
  const themed = (darkValue, lightValue) => (isDark ? darkValue : lightValue);
  // Values shared by several of the style objects below.
  const accentColor = '#D45D44';
  const frameBorder = `1px solid ${themed('#374151', '#e5e7eb')}`;
  const primaryText = themed('#e5e7eb', 'inherit');

  // Outer column holding every card.
  const containerStyle = {
    maxWidth: '900px',
    margin: '0 auto',
    display: 'flex',
    flexDirection: 'column',
    gap: '4px'
  };

  // One row: title on the left, choices on the right.
  // `borderLeft` must stay after `border` so the accent stripe wins.
  const cardStyle = {
    padding: '8px 12px',
    border: frameBorder,
    borderLeft: `3px solid ${themed('#E85D4D', accentColor)}`,
    borderRadius: '4px',
    display: 'flex',
    alignItems: 'center',
    gap: '12px',
    background: themed('#1f2937', '#fff')
  };

  // Fixed-width row title.
  const titleStyle = {
    fontSize: '13px',
    fontWeight: '600',
    minWidth: '140px',
    flexShrink: 0,
    color: primaryText
  };

  // Wrapping container for the choices of a row.
  const itemsStyle = {
    display: 'flex',
    rowGap: '2px',
    columnGap: '6px',
    flexWrap: 'wrap',
    alignItems: 'center',
    flex: 1
  };

  // Base look of a choice button (a <label> wrapping a hidden input).
  const labelBaseStyle = {
    padding: '4px 10px',
    border: `1px solid ${themed('#9ca3af', '#d1d5db')}`,
    borderRadius: '3px',
    cursor: 'pointer',
    display: 'inline-flex',
    flexDirection: 'column',
    alignItems: 'center',
    justifyContent: 'center',
    fontWeight: '500',
    fontSize: '13px',
    transition: 'all 0.2s',
    userSelect: 'none',
    minWidth: '45px',
    textAlign: 'center',
    flex: 1,
    background: themed('#374151', '#fff'),
    color: primaryText
  };

  // Overrides merged on top of labelBaseStyle for the selected choice.
  const checkedStyle = {
    background: accentColor,
    color: 'white',
    borderColor: accentColor
  };

  // Overrides merged on top for choices that cannot be picked.
  const disabledStyle = {
    cursor: 'not-allowed',
    opacity: 0.5
  };

  // Small secondary line under a choice label.
  const subtitleStyle = {
    display: 'block',
    fontSize: '9px',
    marginTop: '1px',
    lineHeight: '1.1',
    opacity: 0.7
  };

  // Free-form text input used by options of type "text".
  const textInputStyle = {
    flex: 1,
    padding: '8px 10px',
    borderRadius: '4px',
    border: `1px solid ${themed('#4b5563', '#d1d5db')}`,
    background: themed('#111827', '#fff'),
    color: themed('#e5e7eb', '#111827'),
    fontSize: '13px'
  };

  // Monospace box showing the generated command.
  const commandDisplayStyle = {
    flex: 1,
    padding: '12px 16px',
    background: themed('#111827', '#f5f5f5'),
    borderRadius: '6px',
    fontFamily: "'Menlo', 'Monaco', 'Courier New', monospace",
    fontSize: '12px',
    lineHeight: '1.5',
    color: themed('#e5e7eb', '#374151'),
    whiteSpace: 'pre-wrap',
    overflowX: 'auto',
    margin: 0,
    border: frameBorder
  };
  return <div style={containerStyle} className="not-prose">
      {Object.entries(options).map(([key, option]) => {
    if (option.condition && !option.condition(values)) {
      return null;
    }
    const items = option.getDynamicItems ? option.getDynamicItems(values) : option.items || [];
    return <div key={key} style={cardStyle}>
            <div style={titleStyle}>{option.title}</div>
            <div style={itemsStyle}>
              {option.type === 'text' ? <input type="text" value={values[option.name] || ''} placeholder={option.placeholder || ''} onChange={event => handleTextChange(option.name, event.target.value)} style={textInputStyle} /> : option.type === 'checkbox' ? (option.items || []).map(item => {
      const isChecked = (values[option.name] || []).includes(item.id);
      const isDisabled = item.required || typeof item.disabledWhen === 'function' && item.disabledWhen(values);
      return <label key={item.id} title={item.disabledReason || ''} style={{
        ...labelBaseStyle,
        ...isChecked ? checkedStyle : {},
        ...isDisabled ? disabledStyle : {}
      }}>
                      <input type="checkbox" checked={isChecked} disabled={isDisabled} onChange={event => handleCheckboxChange(option.name, item.id, event.target.checked)} style={{
        display: 'none'
      }} />
                      {item.label}
                      {item.subtitle && <small style={{
        ...subtitleStyle,
        color: isChecked ? 'rgba(255,255,255,0.85)' : 'inherit'
      }}>
                          {item.subtitle}
                        </small>}
                    </label>;
    }) : items.map(item => {
      const isChecked = values[option.name] === item.id;
      const isDisabled = Boolean(item.disabled);
      return <label key={item.id} title={item.disabledReason || ''} style={{
        ...labelBaseStyle,
        ...isChecked ? checkedStyle : {},
        ...isDisabled ? disabledStyle : {}
      }}>
                      <input type="radio" name={option.name} value={item.id} checked={isChecked} disabled={isDisabled} onChange={() => !isDisabled && handleRadioChange(option.name, item.id)} style={{
        display: 'none'
      }} />
                      {item.label}
                      {item.subtitle && <small style={{
        ...subtitleStyle,
        color: isChecked ? 'rgba(255,255,255,0.85)' : 'inherit'
      }}>
                          {item.subtitle}
                        </small>}
                    </label>;
    })}
            </div>
          </div>;
  })}
      <div style={cardStyle}>
        <div style={titleStyle}>Run this Command:</div>
        <pre style={commandDisplayStyle}>{command}</pre>
      </div>
    </div>;
};

## 1. Model Introduction

[Step-3.7-Flash](https://huggingface.co/stepfun-ai/Step-3.7-Flash) is a 198B-parameter Mixture-of-Experts (MoE) vision-language model that combines a 196B-parameter language backbone with a 1.8B-parameter vision encoder for native image understanding. Engineered for high-frequency production workloads, it activates approximately 11B parameters per token and supports a 256k context window with three selectable reasoning levels (low, medium, and high). The model is available in multiple quantization formats (BF16, FP8, NVFP4).

Step-3.7-Flash is built for developers who need to scale agentic workflows that combine perception, search, and reasoning — from parsing massive financial reports in one pass, to running multi-step search loops with cross-source verification, to operating concurrent coding agents in high-throughput pipelines.

## 2. SGLang Installation

Step-3.7-Flash is currently available in SGLang through a dedicated Docker image.

### Docker (NVIDIA)

```bash Command theme={null}
# Pull the docker image
docker pull lmsysorg/sglang:dev-step-3.7-flash

# Launch the container
docker run -it --gpus all \
  --shm-size=32g \
  --ipc=host \
  --network=host \
  lmsysorg/sglang:dev-step-3.7-flash bash
```

## 3. Model Deployment

This section provides deployment configurations optimized for different use cases.

### 3.1 Basic Configuration

The Step-3.7-Flash series comes in one size with multiple quantization options. Recommended starting configurations vary depending on hardware.

**Interactive Command Generator**: Use the configuration selector below to automatically generate the appropriate deployment command for your hardware platform, quantization method, and capabilities.

<Step37FlashDeployment />

### 3.2 Configuration Tips

* **Memory**: Requires GPUs with high VRAM capacity. Supported platforms: H200 (4x, TP=4), B200/B300 (4x, TP=4), GB200/GB300 (4x, TP=4).
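* **NVFP4 Quantization**: NVFP4 provides the smallest memory footprint. Requires `--quantization modelopt_fp4 --kv-cache-dtype fp8_e4m3 --moe-runner-backend flashinfer_trtllm`. The command generator above also adds `--attention-backend trtllm_mha` for NVFP4 and offers NVFP4 only on B200/B300 and GB200/GB300.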
* **Trust Remote Code**: All Step-3.7-Flash variants require `--trust-remote-code` due to the custom model architecture.

## 4. Model Invocation

### 4.1 Basic Usage

For basic API usage and request examples, please refer to:

* [SGLang Basic Usage Guide](../../../docs/basic_usage/send_request)
* [SGLang OpenAI Vision API Guide](../../../docs/basic_usage/openai_api_vision)

### 4.2 Advanced Usage

#### 4.2.1 Multi-Modal Inputs

Step-3.7-Flash supports image inputs alongside text. Here's a basic example:

```python Example theme={null}
import time
from openai import OpenAI

# Client for the OpenAI-compatible endpoint on localhost:30000.
client = OpenAI(
    api_key="EMPTY",
    base_url="http://localhost:30000/v1",
    timeout=3600
)

# A single user turn: one image (by URL) followed by a text instruction.
messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "image_url",
                "image_url": {
                    "url": "https://ofasys-multimodal-wlcb-3-toshanghai.oss-accelerate.aliyuncs.com/wpf272043/keepme/image/receipt.png"
                }
            },
            {
                "type": "text",
                "text": "Read all the text in the image."
            }
        ]
    }
]

# Time the request, then print the elapsed seconds and the generated text.
start = time.time()
response = client.chat.completions.create(
    model="stepfun-ai/Step-3.7-Flash",
    messages=messages,
    max_tokens=2048,
)
print(f"Response costs: {time.time() - start:.2f}s")
print(f"Generated text: {response.choices[0].message.content}")
```

**Multi-Image Input Example:**

Step-3.7-Flash can process multiple images in a single request for comparison or analysis:

```python Example theme={null}
import time
from openai import OpenAI

# Client for the OpenAI-compatible endpoint on localhost:30000.
client = OpenAI(
    api_key="EMPTY",
    base_url="http://localhost:30000/v1",
    timeout=3600
)

# A single user turn: two images (by URL) followed by a text instruction.
messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "image_url",
                "image_url": {
                    "url": "https://www.civitatis.com/f/china/hong-kong/guia/taxi.jpg"
                }
            },
            {
                "type": "image_url",
                "image_url": {
                    "url": "https://cdn.cheapoguides.com/wp-content/uploads/sites/7/2025/05/GettyImages-509614603-1280x600.jpg"
                }
            },
            {
                "type": "text",
                "text": "Compare these two images and describe the differences in 100 words or less."
            }
        ]
    }
]

# Time the request, then print the elapsed seconds and the generated text.
start = time.time()
response = client.chat.completions.create(
    model="stepfun-ai/Step-3.7-Flash",
    messages=messages,
    max_tokens=2048,
)
print(f"Response costs: {time.time() - start:.2f}s")
print(f"Generated text: {response.choices[0].message.content}")
```

#### 4.2.2 Reasoning Parser

Step-3.7-Flash supports reasoning mode. Enable the reasoning parser during deployment to separate the thinking and content sections:

```shell Command theme={null}
sglang serve \
  --model-path stepfun-ai/Step-3.7-Flash \
  --tp 4 \
  --trust-remote-code \
  --reasoning-parser step3p5
```

```python Example theme={null}
from openai import OpenAI

# Client for the OpenAI-compatible endpoint on localhost:30000.
client = OpenAI(
    base_url="http://localhost:30000/v1",
    api_key="EMPTY"
)

# Enable streaming to see the thinking process in real-time
response = client.chat.completions.create(
    model="stepfun-ai/Step-3.7-Flash",
    messages=[
        {"role": "user", "content": "Solve this problem step by step: What is 15% of 240?"}
    ],
    temperature=0.7,
    max_tokens=2048,
    stream=True
)

# Process the stream
# has_thinking: at least one reasoning chunk was printed
# has_answer: the "Content" header has already been printed
# thinking_started: the "Thinking" header has already been printed
has_thinking = False
has_answer = False
thinking_started = False

for chunk in response:
    if chunk.choices and len(chunk.choices) > 0:
        delta = chunk.choices[0].delta

        # Print thinking process
        if hasattr(delta, 'reasoning_content') and delta.reasoning_content:
            if not thinking_started:
                print("=============== Thinking =================", flush=True)
                thinking_started = True
            has_thinking = True
            print(delta.reasoning_content, end="", flush=True)

        # Print answer content
        if delta.content:
            # Close thinking section and add content header
            # (the header is printed once, and only if thinking was shown first)
            if has_thinking and not has_answer:
                print("\n=============== Content =================", flush=True)
                has_answer = True
            print(delta.content, end="", flush=True)

# Finish the output with a newline.
print()
```

#### 4.2.3 Tool Calling

Step-3.7-Flash supports tool calling capabilities. Enable the tool call parser:

**Start the SGLang server:**

```shell Command theme={null}
sglang serve \
  --model-path stepfun-ai/Step-3.7-Flash \
  --tp 4 \
  --trust-remote-code \
  --reasoning-parser step3p5 \
  --tool-call-parser step3p5
```

```python Example theme={null}
from openai import OpenAI
import json

# Client for the OpenAI-compatible endpoint on localhost:30000.
client = OpenAI(
    base_url="http://localhost:30000/v1",
    api_key="EMPTY"
)

# 1. Define the tools offered to the model (one function, described by a JSON schema)
tools = [
    {
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Get the current weather for a location",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {"type": "string", "description": "The city name"},
                    "unit": {"type": "string", "enum": ["celsius", "fahrenheit"], "description": "Temperature unit"}
                },
                "required": ["location"]
            }
        }
    }
]

# 2. Local implementation of the tool; returns a fixed, canned reply
def get_weather(location, unit="celsius"):
    return f"The weather in {location} is 22 {unit[0].upper()} and sunny."

# 3. Send the first request, with the tools attached
print("--- Sending first request ---")
response = client.chat.completions.create(
    model="stepfun-ai/Step-3.7-Flash",
    messages=[
        {"role": "user", "content": "What's the weather in Beijing?"}
    ],
    tools=tools,
    temperature=1.0,
    stream=False
)

message = response.choices[0].message

# 4. Handle Reasoning Content
# (read via getattr so a missing attribute is treated as "no reasoning")
reasoning = getattr(message, 'reasoning_content', None)
if reasoning:
    print("=============== Thinking =================")
    print(reasoning)
    print("==========================================")

# 5. Handle Tool Calls
if message.tool_calls:
    print("\nTool Calls detected:")
    # Conversation so far: the user question plus the assistant message holding the tool calls.
    history_messages = [
        {"role": "user", "content": "What's the weather in Beijing?"},
        message
    ]

    for tool_call in message.tool_calls:
        print(f"   Tool: {tool_call.function.name}")
        print(f"   Args: {tool_call.function.arguments}")

        # Arguments arrive as a JSON string; decode them and run the local tool.
        args = json.loads(tool_call.function.arguments)
        tool_result = get_weather(args.get("location"), args.get("unit", "celsius"))

        # Append the result as a "tool" message tied to this call's id.
        history_messages.append({
            "role": "tool",
            "tool_call_id": tool_call.id,
            "content": tool_result
        })

    # Send the extended history back to get the final answer.
    print("\n--- Sending tool results ---")
    final_response = client.chat.completions.create(
        model="stepfun-ai/Step-3.7-Flash",
        messages=history_messages,
        temperature=1.0,
        stream=False
    )

    print("=============== Final Content =================")
    print(final_response.choices[0].message.content)

else:
    # No tool call was made: print the plain content, if any.
    if message.content:
        print("=============== Content =================")
        print(message.content)
```

**Note:**

* The reasoning parser shows how the model decides to use a tool.
* Tool calls are clearly marked with the function name and arguments.
* You can then execute the function and send the result back to continue the conversation.

## 5. Benchmark

*Benchmark results will be added soon.*
