1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27
| python -m vllm.entrypoints.openai.api_server \ --model /root/.cache/modelscope/hub/models/Qwen/Qwen3-VL-8B-Instruct \ --dtype bfloat16 \ --gpu-memory-utilization 0.9 \ --max-model-len 8192 \ --max_num_batched_tokens 8192
--host 0.0.0.0 \ --port 8000 \ --compilation-config '{"cudagraph_capture_sizes": [1, 2, 4, 8, 16, 32, 64, 128, 256, 512]}'
curl -X POST http://localhost:8000/v1/chat/completions \ -H "Content-Type: application/json" \ -d '{ "model": "/root/.cache/modelscope/hub/models/Qwen/Qwen3-VL-8B-Instruct", "messages": [ {"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": [ {"type": "image_url", "image_url": {"url": "https://modelscope.oss-cn-beijing.aliyuncs.com/resource/qwen.png"}}, {"type": "text", "text": "What is the text in the illustrate? How does it look?"} ]} ], "max_tokens": 4096 }'
max_completion_tokens
|