# Speculative-decoding test and serving commands.
# NOTE(review): the commands in this file look independent of one another and
# are presumably meant to be run one at a time — confirm before executing the
# file as a single script.

# Run the ngram correctness case from the V1 speculative-decoding e2e tests.
# -s disables pytest output capture; -v enables verbose reporting.
pytest -sv \
  tests/long_term/spec_decode/e2e/test_v1_spec_decode.py::test_ngram_correctness
# Run the ngram greedy-correctness e2e case from the spec_decode_v0 suite.
# VLLM_USE_V1=0 is set as a command prefix, so it applies to this pytest
# invocation only and does not persist in the calling shell.
VLLM_USE_V1=0 pytest -sv \
  tests/e2e/long_term/spec_decode_v0/e2e/test_ngram_correctness.py::test_ngram_e2e_greedy_correctness
# Offline speculative-decoding example with the ngram method:
# 2 speculative tokens, prompt-lookup min/max of 3/5.
# NOTE(review): -tp 2 is presumably the tensor-parallel size — confirm against
# the example script's argument parser.
python /home/sss/github/vllm/examples/offline_inference/spec_decode.py \
  --method ngram \
  --num-spec-tokens 2 \
  --prompt-lookup-max 5 \
  --prompt-lookup-min 3 \
  --model-dir /shared/cache/modelscope/hub/models/LLM-Research/Meta-Llama-3.1-8B-Instruct \
  -tp 2
# Serve Meta-Llama-3.1-8B-Instruct with ngram speculative decoding
# (3 speculative tokens, prompt-lookup min/max of 3/5), eager mode enforced.
# The speculative config is a single-quoted JSON string so the shell passes it
# through verbatim as one argument.
vllm serve /shared/cache/modelscope/hub/models/LLM-Research/Meta-Llama-3.1-8B-Instruct \
  --max-model-len 1024 \
  --speculative-config '{"method": "ngram", "num_speculative_tokens": 3, "prompt_lookup_max": 5, "prompt_lookup_min": 3}' \
  --gpu_memory_utilization 0.9 \
  --trust-remote-code \
  --enforce-eager \
  -tp 2
# Offline speculative-decoding example with the eagle3 method:
# 2 speculative tokens, target model from --model-dir and the EAGLE3 draft
# model from --eagle-dir.
python /home/sss/github/vllm/examples/offline_inference/spec_decode.py \
  --method eagle3 \
  --num-spec-tokens 2 \
  --model-dir /shared/cache/modelscope/hub/models/LLM-Research/Meta-Llama-3.1-8B-Instruct \
  --eagle-dir /home/sss/models/models/models/vllm-ascend/EAGLE3-LLaMA3.1-Instruct-8B
# Serve Meta-Llama-3.1-8B-Instruct with eagle3 speculative decoding
# (2 speculative tokens). The speculative config carries its own
# "max_model_len" of 128, separate from the server-level --max-model-len 1024.
vllm serve /shared/cache/modelscope/hub/models/LLM-Research/Meta-Llama-3.1-8B-Instruct \
  --max-model-len 1024 \
  --speculative-config '{"method": "eagle3", "num_speculative_tokens": 2, "max_model_len": 128, "model": "/home/sss/models/models/models/vllm-ascend/EAGLE3-LLaMA3.1-Instruct-8B"}' \
  --gpu_memory_utilization 0.9 \
  --trust-remote-code \
  --enforce-eager \
  -tp 2
# Serve Meta-Llama-3.1-8B-Instruct with the "eagle" speculative method
# (2 speculative tokens).
# NOTE(review): the method is "eagle" but the draft model path points at an
# EAGLE3 checkpoint — confirm this pairing is intentional.
vllm serve /shared/cache/modelscope/hub/models/LLM-Research/Meta-Llama-3.1-8B-Instruct \
  --max-model-len 1024 \
  --speculative-config '{"method": "eagle", "num_speculative_tokens": 2, "model": "/home/sss/models/models/models/vllm-ascend/EAGLE3-LLaMA3.1-Instruct-8B"}' \
  --gpu_memory_utilization 0.9 \
  --trust-remote-code \
  --enforce-eager \
  -tp 2
# Serve DeepSeek-V3-W8A8 with deepseek_mtp speculative decoding
# (1 speculative token), ascend quantization, tensor-parallel size 4,
# data-parallel size 4, expert parallelism enabled and prefix caching disabled.
# The additional config enables the ascend scheduler with chunked prefill off
# and the torchair graph with graph_batch_sizes [16], alongside
# --max-num-seqs 16.
vllm serve /mnt/sfs_turbo/ascend-ci-share-nv-action-vllm-benchmarks/modelscope/hub/models/vllm-ascend/DeepSeek-V3-W8A8 \
  --max-model-len 1024 \
  --max-num-seqs 16 \
  --no-enable-prefix-caching \
  --tensor-parallel-size 4 \
  --data_parallel_size 4 \
  --enable_expert_parallel \
  --speculative-config '{"method":"deepseek_mtp", "num_speculative_tokens": 1}' \
  --quantization ascend \
  --additional-config '{"ascend_scheduler_config": {"enabled": true, "enable_chunked_prefill": false}, "torchair_graph_config": {"enabled": true, "graph_batch_sizes": [16]}, "enable_weight_nz_layout": true}' \
  --gpu_memory_utilization 0.9 \
  --trust-remote-code