Qwen3.5 examples#
Environment Preparation#
Installation#
The dependencies required for the NPU runtime environment have been integrated into a Docker image and uploaded to the quay.io platform. You can directly pull it.
#Atlas 800 A3
docker pull quay.io/ascend/sglang:main-cann8.5.0-a3
#Atlas 800 A2
docker pull quay.io/ascend/sglang:main-cann8.5.0-910b
#start container
docker run -itd --shm-size=16g --privileged=true --name ${NAME} \
--privileged=true --net=host \
-v /var/queue_schedule:/var/queue_schedule \
-v /etc/ascend_install.info:/etc/ascend_install.info \
-v /usr/local/sbin:/usr/local/sbin \
-v /usr/local/Ascend/driver:/usr/local/Ascend/driver \
-v /usr/local/Ascend/firmware:/usr/local/Ascend/firmware \
--device=/dev/davinci0:/dev/davinci0 \
--device=/dev/davinci1:/dev/davinci1 \
--device=/dev/davinci2:/dev/davinci2 \
--device=/dev/davinci3:/dev/davinci3 \
--device=/dev/davinci4:/dev/davinci4 \
--device=/dev/davinci5:/dev/davinci5 \
--device=/dev/davinci6:/dev/davinci6 \
--device=/dev/davinci7:/dev/davinci7 \
--device=/dev/davinci8:/dev/davinci8 \
--device=/dev/davinci9:/dev/davinci9 \
--device=/dev/davinci10:/dev/davinci10 \
--device=/dev/davinci11:/dev/davinci11 \
--device=/dev/davinci12:/dev/davinci12 \
--device=/dev/davinci13:/dev/davinci13 \
--device=/dev/davinci14:/dev/davinci14 \
--device=/dev/davinci15:/dev/davinci15 \
--device=/dev/davinci_manager:/dev/davinci_manager \
--device=/dev/hisi_hdc:/dev/hisi_hdc \
--entrypoint=bash \
quay.io/ascend/sglang:${tag}
Deployment#
Single-node Deployment#
Run the following script to execute online inference.
Qwen3.5 397B#
# high performance cpu
echo performance | tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor
sysctl -w vm.swappiness=0
sysctl -w kernel.numa_balancing=0
sysctl -w kernel.sched_migration_cost_ns=50000
# bind cpu
export SGLANG_SET_CPU_AFFINITY=1
unset https_proxy
unset http_proxy
unset HTTPS_PROXY
unset HTTP_PROXY
unset ASCEND_LAUNCH_BLOCKING
# cann
source /usr/local/Ascend/ascend-toolkit/set_env.sh
source /usr/local/Ascend/nnal/atb/set_env.sh
export STREAMS_PER_DEVICE=32
export HCCL_BUFFSIZE=1000
export HCCL_OP_EXPANSION_MODE=AIV
export HCCL_SOCKET_IFNAME=lo
export GLOO_SOCKET_IFNAME=lo
python3 -m sglang.launch_server \
--model-path $MODEL_PATH \
--attention-backend ascend \
--device npu \
--tp-size 16 --nnodes 1 --node-rank 0 \
--chunked-prefill-size 4096 --max-prefill-tokens 280000 \
--disable-radix-cache \
--trust-remote-code \
--host 127.0.0.1 \
--mem-fraction-static 0.7 \
--port 8000 \
--cuda-graph-bs 16 \
--quantization modelslim \
--enable-multimodal \
--mm-attention-backend ascend_attn \
--dtype bfloat16
Qwen3.5 122B#
# high performance cpu
echo performance | tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor
sysctl -w vm.swappiness=0
sysctl -w kernel.numa_balancing=0
sysctl -w kernel.sched_migration_cost_ns=50000
# bind cpu
export SGLANG_SET_CPU_AFFINITY=1
unset https_proxy
unset http_proxy
unset HTTPS_PROXY
unset HTTP_PROXY
unset ASCEND_LAUNCH_BLOCKING
# cann
source /usr/local/Ascend/ascend-toolkit/set_env.sh
source /usr/local/Ascend/nnal/atb/set_env.sh
export STREAMS_PER_DEVICE=32
export HCCL_BUFFSIZE=1000
export HCCL_OP_EXPANSION_MODE=AIV
export HCCL_SOCKET_IFNAME=lo
export GLOO_SOCKET_IFNAME=lo
python3 -m sglang.launch_server \
--model-path $MODEL_PATH \
--attention-backend ascend \
--device npu \
--tp-size 8 --nnodes 1 --node-rank 0 \
--chunked-prefill-size 4096 --max-prefill-tokens 280000 \
--disable-radix-cache \
--trust-remote-code \
--host 127.0.0.1 \
--mem-fraction-static 0.7 \
--port 8000 \
--cuda-graph-bs 16 \
--quantization modelslim \
--enable-multimodal \
--mm-attention-backend ascend_attn \
--dtype bfloat16
Qwen3.5 35B#
# high performance cpu
echo performance | tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor
sysctl -w vm.swappiness=0
sysctl -w kernel.numa_balancing=0
sysctl -w kernel.sched_migration_cost_ns=50000
# bind cpu
export SGLANG_SET_CPU_AFFINITY=1
unset https_proxy
unset http_proxy
unset HTTPS_PROXY
unset HTTP_PROXY
unset ASCEND_LAUNCH_BLOCKING
# cann
source /usr/local/Ascend/ascend-toolkit/set_env.sh
source /usr/local/Ascend/nnal/atb/set_env.sh
export STREAMS_PER_DEVICE=32
export HCCL_BUFFSIZE=1000
export HCCL_OP_EXPANSION_MODE=AIV
export HCCL_SOCKET_IFNAME=lo
export GLOO_SOCKET_IFNAME=lo
python3 -m sglang.launch_server \
--model-path $MODEL_PATH \
--attention-backend ascend \
--device npu \
--tp-size 2 --nnodes 1 --node-rank 0 \
--chunked-prefill-size 4096 --max-prefill-tokens 280000 \
--disable-radix-cache \
--trust-remote-code \
--host 127.0.0.1 \
--mem-fraction-static 0.7 \
--port 8000 \
--cuda-graph-bs 16 \
--quantization modelslim \
--enable-multimodal \
--mm-attention-backend ascend_attn \
--dtype bfloat16
Qwen3.5 27B#
# high performance cpu
echo performance | tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor
sysctl -w vm.swappiness=0
sysctl -w kernel.numa_balancing=0
sysctl -w kernel.sched_migration_cost_ns=50000
# bind cpu
export SGLANG_SET_CPU_AFFINITY=1
unset https_proxy
unset http_proxy
unset HTTPS_PROXY
unset HTTP_PROXY
unset ASCEND_LAUNCH_BLOCKING
# cann
source /usr/local/Ascend/ascend-toolkit/set_env.sh
source /usr/local/Ascend/nnal/atb/set_env.sh
export STREAMS_PER_DEVICE=32
export HCCL_BUFFSIZE=1000
export HCCL_OP_EXPANSION_MODE=AIV
export HCCL_SOCKET_IFNAME=lo
export GLOO_SOCKET_IFNAME=lo
python3 -m sglang.launch_server \
--model-path $MODEL_PATH \
--attention-backend ascend \
--device npu \
--tp-size 2 \
--chunked-prefill-size -1 --max-prefill-tokens 120000 \
--disable-radix-cache \
--trust-remote-code \
--host 127.0.0.1 \
--mem-fraction-static 0.8 \
--port 8000 \
--cuda-graph-bs 32 \
--enable-multimodal \
--mm-attention-backend ascend_attn
Prefill-Decode Disaggregation#
Not test yet.
Using Benchmark#
Refer to Benchmark and Profiling for details.