vLLM MEMO

开发常用

```bash
# init env
cd ~/github/vllm/
source .venv/bin/activate
export VLLM_USE_MODELSCOPE=False
export HF_ENDPOINT="https://hf-mirror.com"
# export VLLM_USE_MODELSCOPE=True
pre-commit install

# def run_qwen2_5_vl
# /home/sss/.cache/modelscope/hub/models/Qwen/Qwen2.5-VL-7B-Instruct
python examples/offline_inference/vision_language.py -m qwen2_5_vl

# def run_qwen3_vl
# /home/sss/.cache/modelscope/hub/models/Qwen/Qwen3-VL-4B-Instruct
# /home/sss/.cache/modelscope/hub/models/Qwen/Qwen3-VL-8B-Instruct
python examples/offline_inference/vision_language.py -m qwen3_vl --modality "video"

# def run_dots_ocr
# /home/sss/.cache/huggingface/hub/models/dots_ocr
python examples/offline_inference/vision_language.py -m dots_ocr

# def run_deepseek_ocr
# /home/sss/.cache/modelscope/hub/models/deepseek-ai/DeepSeek-OCR
python examples/offline_inference/vision_language.py -m deepseek_ocr
```

```bash
# vllm
git sync
git fetch --tags
git tag -l | grep v0.18.0
git checkout -b v0.18.0-dev v0.18.0
```

Models

```text
# A100
/mnt/sfs_turbo/models/modelscope/models/Qwen/Qwen3-VL-8B-Instruct
/mnt/sfs_turbo/models/modelscope/models/Qwen/Qwen3-VL-32B-Instruct

# A100 GPU Coder
/home/sss/.cache/modelscope/hub/models/Qwen/Qwen3-VL-4B-Instruct
/shared/models/modelscope/models/Qwen/Qwen3-VL-32B-Instruct
```

Launch Args

```bash
# 环境变量
export VLLM_USE_V1=1
export VLLM_WORKER_MULTIPROC_METHOD="spawn"
export VLLM_USE_MODELSCOPE=True

# 启动参数(离线)
model="Qwen/QwQ-32B"
tensor_parallel_size=2
pipeline_parallel_size=2
distributed_executor_backend="mp"
max_model_len=4096 # Limit context window
max_num_seqs=4 # Limit batch size
enforce_eager=True
trust_remote_code=True
gpu_memory_utilization=0.9

# 启动参数(在线)
vllm serve Qwen/Qwen3-8B \
--max_model_len 16384 \
--max-num-batched-tokens 16384 \
--dtype bfloat16 \
--enforce-eager \
--trust-remote-code \
--enable-expert-parallel \
--no-enable-expert-parallel \

# MM related:
```

Debug

```bash
VLLM_LOGGING_LEVEL=DEBUG vllm serve ...
```

```python
from vllm.logger import init_logger
logger = init_logger(__name__)

logger.info_once(f"...")
```

Benchmark Datasets

下载 huggingface 工具:

```bash
uv pip install huggingface_hub -i https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
export HF_ENDPOINT=https://hf-mirror.com
```

下载数据集:

```python
# Default path:
# ~/.cache/huggingface/datasets/
# ├── dataset_name/
# │   └── config_name/
# │       └── version/
# │           └── hash/
# │               ├── dataset_info.json
# │               ├── dataset.arrow
# │               └── ...

# lmarena-ai/VisionArena-Chat (too big)
# lmarena-ai/vision-arena-bench-v0.1 (small version)
import os
from huggingface_hub import snapshot_download


# os.environ['HF_DATASETS_CACHE'] = '/home/sss/.cache/huggingface/datasets'
# os.environ['HF_DATASETS_CACHE'] = '/shared/datasets'
snapshot_download(
repo_id="lmarena-ai/vision-arena-bench-v0.1",
repo_type="dataset",
cache_dir="/shared/sss/datasets",
)
```

Commands

```bash
# vllm-ascend format
yapf -i <file>
isort <file>
ruff check <file>

# Clear process
ps -ef | grep vllm | cut -c 9-16 | xargs kill -9
ps -ef | grep VLLM | cut -c 9-16 | xargs kill -9
ps -ef | grep python | cut -c 9-16 | xargs kill -9
```

Structured Output

```bash
# e2e test
pytest -sv \
tests/v1/entrypoints/llm/test_struct_output_generate.py::test_structured_output
pytest -sv \
tests/e2e/singlecard/test_guided_decoding.py::test_guided_regex
# v0.9.1
pytest -sv tests/singlecard/test_guided_decoding.py


# Benchmark (with thinking disabled)
# 1.
vllm serve Qwen/Qwen3-1.7B \
--no-enable-prefix-caching
# 2.
python benchmarks/benchmark_serving_structured_output.py \
--backend vllm \
--model Qwen/Qwen3-1.7B \
--dataset json \
--structured-output-ratio 1.0 \
--request-rate 1000 \
--num-prompts 2000
# or:
python3 benchmarks/benchmark_serving_structured_output.py \
--backend vllm \
--model Qwen/Qwen3-1.7B \
--structured-output-ratio 1.0 \
--request-rate 100 \
--num-prompts 2000 \
--json-schema-path ./test3.json \
--output-len 2048


vllm serve /root/.cache/modelscope/hub/models/Qwen/Qwen2.5-0.5B-Instruct \
--max_model_len 26240 \
--pipeline-parallel-size 2

curl http://localhost:8000/v1/completions \
-H "Content-Type: application/json" \
-d '{
"model": "/root/.cache/modelscope/hub/models/Qwen/Qwen2.5-0.5B-Instruct",
"prompt": "Hello, my name is",
"max_tokens": 7,
"temperature": 0
}'
```

Spec Decode

```bash
pytest -sv \
tests/long_term/spec_decode/e2e/test_v1_spec_decode.py::test_ngram_correctness

VLLM_USE_V1=0 pytest -sv \
tests/e2e/long_term/spec_decode_v0/e2e/test_ngram_correctness.py::test_ngram_e2e_greedy_correctness

# Ngram
python /home/sss/github/vllm/examples/offline_inference/spec_decode.py \
--method ngram \
--num-spec-tokens 2 \
--prompt-lookup-max 5 \
--prompt-lookup-min 3 \
--model-dir /shared/cache/modelscope/hub/models/LLM-Research/Meta-Llama-3.1-8B-Instruct \
-tp 2
# Online
vllm serve /shared/cache/modelscope/hub/models/LLM-Research/Meta-Llama-3.1-8B-Instruct \
--max-model-len 1024 \
--speculative-config '{"method": "ngram", "num_speculative_tokens": 3, "prompt_lookup_max": 5, "prompt_lookup_min": 3}' \
--gpu_memory_utilization 0.9 \
--trust-remote-code \
--enforce-eager \
-tp 2

# EAGLE-3
python /home/sss/github/vllm/examples/offline_inference/spec_decode.py \
--method eagle3 \
--num-spec-tokens 2 \
--model-dir /shared/cache/modelscope/hub/models/LLM-Research/Meta-Llama-3.1-8B-Instruct \
--eagle-dir /home/sss/models/models/models/vllm-ascend/EAGLE3-LLaMA3.1-Instruct-8B
# ValueError: Speculative tokens > 2 are not supported yet.
# Online
vllm serve /shared/cache/modelscope/hub/models/LLM-Research/Meta-Llama-3.1-8B-Instruct \
--max-model-len 1024 \
--speculative-config '{"method": "eagle3", "num_speculative_tokens": 2, "max_model_len": 128, "model": "/home/sss/models/models/models/vllm-ascend/EAGLE3-LLaMA3.1-Instruct-8B"}' \
--gpu_memory_utilization 0.9 \
--trust-remote-code \
--enforce-eager \
-tp 2

vllm serve /shared/cache/modelscope/hub/models/LLM-Research/Meta-Llama-3.1-8B-Instruct \
--max-model-len 1024 \
--speculative-config '{"method": "eagle", "num_speculative_tokens": 2, "model": "/home/sss/models/models/models/vllm-ascend/EAGLE3-LLaMA3.1-Instruct-8B"}' \
--gpu_memory_utilization 0.9 \
--trust-remote-code \
--enforce-eager \
-tp 2

# MTP
# Doc: https://vllm-ascend.readthedocs.io/en/latest/tutorials/multi_node.html
vllm serve /mnt/sfs_turbo/ascend-ci-share-nv-action-vllm-benchmarks/modelscope/hub/models/vllm-ascend/DeepSeek-V3-W8A8 \
--max-model-len 1024 \
--max-num-seqs 16 \
--no-enable-prefix-caching \
--tensor-parallel-size 4 \
--data_parallel_size 4 \
--enable_expert_parallel \
--speculative-config '{"method":"deepseek_mtp", "num_speculative_tokens": 1}' \
--quantization ascend \
--additional-config '{"ascend_scheduler_config": {"enabled": true, "enable_chunked_prefill": false}, "torchair_graph_config": {"enabled": true, "graph_batch_sizes": [16]}, "enable_weight_nz_layout": true}' \
--gpu_memory_utilization 0.9 \
--trust-remote-code
```

```bash
curl http://localhost:8000/v1/completions \
-H "Content-Type: application/json" \
-d '{
"model": "/shared/cache/modelscope/hub/models/LLM-Research/Meta-Llama-3.1-8B-Instruct",
"prompt": "The future of AI is",
"max_tokens": 100,
"temperature": 0
}'
```

PR Desc

```markdown
> [!NOTE]
>

**🤖 AI Summary:**
```